package mobvista.dmp.main

import mobvista.dmp.common.CommonSparkJob
import org.apache.commons.cli.Options
import org.apache.spark.sql.SparkSession

/**
  * Aggregates ADN and DSP request logs to make downstream statistics easier.
  *
  * The job runs the SQL statement given by `-sql` through a Hive-enabled
  * [[SparkSession]] and writes the result as zlib-compressed ORC files under
  * the path given by `-output`.
  */
class MdsRequestCluster extends CommonSparkJob with Serializable {

  /**
    * Parses the command line and, when it is valid, runs the query.
    *
    * @param args raw command-line arguments
    * @return `0` when the query result was written, `-1` when the mandatory
    *         options check failed (usage is printed in that case)
    */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      // Stop here: continuing would start a Spark session with missing options.
      printUsage(options)
      -1
    } else {
      printOptions(commandLine)

      val sql = commandLine.getOptionValue("sql")
      val output = commandLine.getOptionValue("output")
      // Both numeric options are optional; they are parsed before the session is
      // created so that a malformed number fails fast.
      val parallelism = optionalInt(commandLine.getOptionValue("parallelism"))
      val outputFiles = optionalInt(commandLine.getOptionValue("coalesce"))

      executeQuery(sql, output, parallelism, outputFiles)
      0
    }
  }

  /** Converts a possibly-absent (null) option value into an optional integer. */
  private def optionalInt(raw: String): Option[Int] = Option(raw).map(_.toInt)

  /**
    * Runs `sql` and writes the result to `output` as zlib-compressed ORC.
    *
    * @param sql         statement to execute
    * @param output      destination path of the ORC files
    * @param parallelism value for `spark.default.parallelism`, if supplied
    * @param outputFiles number of partitions to coalesce the result into, if supplied
    */
  private def executeQuery(sql: String,
                           output: String,
                           parallelism: Option[Int],
                           outputFiles: Option[Int]): Unit = {
    val baseBuilder = SparkSession
      .builder()
      .appName("MdsRequestCluster")
      .config("spark.rdd.compress", "true")
      .config("spark.sql.orc.enabled", "true")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()

    // Only override the default parallelism when the caller asked for it.
    val spark = parallelism
      .fold(baseBuilder)(p => baseBuilder.config("spark.default.parallelism", p.toString))
      .getOrCreate()

    try {
      val result = spark.sql(sql)
      // Honour the "number of output files" option; non-positive values are
      // ignored because coalesce requires a positive partition count.
      val shaped = outputFiles.filter(_ > 0).fold(result)(n => result.coalesce(n))
      shaped
        .write
        .option("orc.compress", "zlib")
        .orc(output)
    } finally {
      spark.stop()
    }
  }

  /**
    * Declares the command-line options understood by this job.
    *
    * @return the option set: `sql`, `output`, `parallelism` and `coalesce`
    */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("sql", true, "[must] sql")
    options.addOption("output", true, "[must] output path")
    options.addOption("parallelism", true, "parallelism of shuffle operation")
    options.addOption("coalesce", true, "number of output files")
    options
  }
}

/** Command-line entry point for the [[MdsRequestCluster]] job. */
object MdsRequestCluster {

  /**
    * Runs the job and propagates a failing status to the calling process.
    *
    * @param args raw command-line arguments, forwarded to the job unchanged
    */
  def main(args: Array[String]): Unit = {
    val exitCode = new MdsRequestCluster().run(args)
    // Surface failures to the launcher/scheduler; on success fall through so
    // the JVM terminates normally.
    if (exitCode != 0) {
      sys.exit(exitCode)
    }
  }
}
