package mobvista.dmp.main

import java.net.URI

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.RDDMultipleOutputFormat
import mobvista.dmp.util.DelayUtil
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.hadoop.io.{SequenceFile, Text}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark job that rewrites iOS package names to their numeric form where possible and
  * separates the records whose package name cannot be resolved.
  *
  * 1. Join the unmatched data with the bundle_bundle_mapping data.
  * 2. Find the records that are still unmatched.
  *
  * Created by fl on 2017/7/18.
  */
class BundleMatchJob extends CommonSparkJob with Serializable {
  /** Column separator used when parsing the bundle -> package dictionary file. */
  val dataSplit = "\t"
  /** A package name is treated as already resolved when it consists of digits only. */
  val packageRegex = "^[0-9]+$"

  /**
    * Parses the command line, builds the SparkContext and runs the matching pipeline.
    *
    * @param args command-line arguments, see [[buildOptions]]
    * @return 1 when `checkMustOption` rejects the command line, 0 when the job completes;
    *         any exception raised while running propagates after the SparkContext is stopped
    */
  def run(args: Array[String]): Int = {
    var sc: SparkContext = null

    try {
      val options = buildOptions()
      val commandLine = new BasicParser().parse(options, args)

      if (!checkMustOption(commandLine)) {
        printUsage(options)
        return 1
      } else {
        printOptions(commandLine)
      }

      val input = commandLine.getOptionValue("input")
      val output = commandLine.getOptionValue("output")
      val pfIndex = commandLine.getOptionValue("pfIndex").toInt
      val pkgIndex = commandLine.getOptionValue("pkgIndex").toInt
      val bundlePkgPath = commandLine.getOptionValue("bundlePkgPath")
      val unmatchInputPath = commandLine.getOptionValue("unmatchInputPath")
      val unmatchOutputPath = commandLine.getOptionValue("unmatchOutputPath")
      val parallelism = commandLine.getOptionValue("parallelism").toInt
      val coalesce = commandLine.getOptionValue("coalesce").toInt

      val conf = new SparkConf()
        .setAppName("BundleMatchJob")
        .set("spark.rdd.compress", "true")
        .set("spark.io.compression.codec", "snappy")
        .set("spark.default.parallelism", s"$parallelism")
        .set("spark.sql.orc.filterPushdown", "true")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      sc = new SparkContext(conf)

      // Remove any previous output. When the output path is fully qualified, use the file
      // system it names so that other buckets/schemes do not fail with "Wrong FS"; a path
      // without a scheme keeps resolving against the original default bucket.
      val outputPath = new Path(output)
      val outputFs =
        if (outputPath.toUri.getScheme != null) outputPath.getFileSystem(sc.hadoopConfiguration)
        else FileSystem.get(new URI("s3://mob-emr-test"), sc.hadoopConfiguration)
      outputFs.delete(outputPath, true)

      // Dictionary table: first column -> second column, collected to the driver and broadcast.
      // Rows with fewer than two columns (e.g. blank lines) are skipped instead of failing
      // the whole job with an ArrayIndexOutOfBoundsException.
      val bundleInfoMap = sc.textFile(bundlePkgPath)
        .map(record => StringUtils.splitPreserveAllTokens(record, dataSplit, -1))
        .filter(_.length >= 2)
        .map(splits => (splits(0), splits(1)))
        .collectAsMap()
      val bundleBC = sc.broadcast(bundleInfoMap)

      val inputData = sc.textFile(input)
      val oldUnmatch = sc.textFile(unmatchInputPath).coalesce(coalesce)

      inputData.union(oldUnmatch)
        .map(splitFun)
        .filter(array => StringUtils.isNotEmpty(array(pkgIndex)))
        .map(array => {
          val platform = array(pfIndex)
          val packageName = array(pkgIndex)
          // Only non-numeric iOS package names need resolving; every other record is written
          // to the regular output unchanged.
          val outPath =
            if ("ios".equals(platform) && !packageName.matches(packageRegex)) {
              // Removes every occurrence of "id" (e.g. "id123" -> "123").
              val stripped = packageName.replace("id", "")
              if (stripped.matches(packageRegex)) {
                array(pkgIndex) = stripped
                output
              } else {
                bundleBC.value.get(packageName) match {
                  case Some(mappedPackage) =>
                    array(pkgIndex) = mappedPackage
                    output
                  // Not in the dictionary: keep the record as is and route it to the
                  // unmatched location.
                  case None =>
                    unmatchOutputPath
                }
              }
            } else {
              output
            }
          // The key carries the destination path; presumably RDDMultipleOutputFormat uses it
          // to pick the output directory — confirm against that class.
          (new Text(s"$outPath"), new Text(array.mkString(DATA_SPLIT)))
        })
        .repartition(coalesce)
        .saveAsNewAPIHadoopFile(output, classOf[Text], classOf[Text], classOf[RDDMultipleOutputFormat[_, _]], getCompressConf(sc))

    } finally {
      if (sc != null) {
        sc.stop()
      }
    }
    0
  }

  /**
    * Enables block-level gzip compression for file output.
    *
    * Note: the settings are applied to `sc.hadoopConfiguration` itself and that same instance
    * is returned, so they remain visible to any later use of the context's configuration.
    *
    * @param sc the active SparkContext
    * @return the context's Hadoop configuration with output compression enabled
    */
  def getCompressConf(sc: SparkContext): Configuration = {
    val conf = sc.hadoopConfiguration
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.type", SequenceFile.CompressionType.BLOCK.toString)
    conf.setClass("mapreduce.output.fileoutputformat.compress.codec", classOf[GzipCodec], classOf[CompressionCodec])
    conf
  }


  /**
    * Declares the command-line options understood by this job.
    *
    * `parallelism` and `coalesce` are not labelled "[must]" here, but [[run]] converts both
    * with `toInt` unconditionally, so they have to be supplied as well.
    *
    * @return the option set passed to the parser and to `printUsage`
    */
  override
  def buildOptions(): Options = {
    val options = new Options
    options.addOption("input", true, "[must] input path")
    options.addOption("output", true, "[must] output path")
    options.addOption("pfIndex", true, "[must] index of platform column")
    options.addOption("pkgIndex", true, "[must] index of package column")
    options.addOption("bundlePkgPath", true, "[must] bundlePkgPath")
    options.addOption("unmatchInputPath", true, "[must] unmatchInputPath")
    options.addOption("unmatchOutputPath", true, "[must] unmatchOutputPath")
    options.addOption("parallelism", true, "parallelism of shuffle operation")
    options.addOption("coalesce", true, "number of output files")
    options
  }
}

object BundleMatchJob {
  /**
    * Command-line entry point.
    *
    * Records the start time, runs a new [[BundleMatchJob]] with the given arguments and
    * always records/prints the end time, even when the job throws. The status code returned
    * by `run` is not used.
    *
    * @param args command-line arguments forwarded to `BundleMatchJob.run`
    */
  def main(args: Array[String]): Unit = {
    val delay = new DelayUtil()
    try {
      val startedAt = System.currentTimeMillis()
      delay.setStart(startedAt)
      delay.printStart("Start BundleMatchJob")
      val job = new BundleMatchJob()
      job.run(args)
    } finally {
      val finishedAt = System.currentTimeMillis()
      delay.setEnd(finishedAt)
      delay.printEnd("Finish BundleMatchJob")
    }
  }
}
