package mobvista.dmp.main

import java.net.URI

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.MultipleOrcOutputFormat
import mobvista.dmp.main.Constant._
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.Text
import org.apache.orc.mapred.OrcStruct
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
  * @desc
  * 1. Join the unmatched data with the bundle_bundle_mapping data.
  * 2. Find the records that are still unmatched.
  * @author wangjf
  * @date 2020-04-20 17:23:17
  */
abstract class BundleMatchNewJob extends CommonSparkJob with Serializable {
  /** Field separator used when splitting each line of the bundle dictionary file. */
  val dataSplit: String = "\t"
  /** Regex matching strings made up solely of ASCII digits; not referenced inside this class. */
  val packageRegex: String = "^[0-9]+$"

  /**
    * Parses the command line, loads the bundle dictionary, delegates to [[processData]] and
    * writes the resulting rows as ORC through `MultipleOrcOutputFormat`.
    *
    * Both `output` and `unmatchOutputPath` are deleted recursively before any data is written.
    *
    * @param args raw command-line arguments; see [[buildOptions]] for the accepted options
    * @return 1 when `checkMustOption` rejects the command line, 0 once the output has been saved
    */
  def run(args: Array[String]): Int = {
    val options = buildOptions()
    val commandLine = new BasicParser().parse(options, args)

    if (!checkMustOption(commandLine)) {
      printUsage(options)
      1
    } else {
      printOptions(commandLine)

      val coalesce = Integer.parseInt(commandLine.getOptionValue("coalesce"))
      val date = commandLine.getOptionValue("date")
      val business = commandLine.getOptionValue("business")
      val input = commandLine.getOptionValue("input")
      val output = commandLine.getOptionValue("output")
      val bundlePkgPath = commandLine.getOptionValue("bundlePkgPath")
      val unmatchInputPath = commandLine.getOptionValue("unmatchInputPath")
      val unmatchOutputPath = commandLine.getOptionValue("unmatchOutputPath")

      val spark = mobvista.dmp.common.MobvistaConstant.createSparkSession(s"BundleMatchNewJob.$business")
      val sc = spark.sparkContext

      // Resolve the file system from the path's own URI rather than a fixed bucket, so the
      // delete always targets the same location the job later writes to.
      def deleteRecursively(location: String): Unit = {
        val target = new Path(location)
        val targetUri: URI = target.toUri
        FileSystem.get(targetUri, sc.hadoopConfiguration).delete(target, true)
      }

      // Everything after session creation runs inside the try so that a failure while
      // cleaning the output directories still stops the session.
      try {
        deleteRecursively(output)
        deleteRecursively(unmatchOutputPath)

        // Dictionary table: first column -> second column of every line in bundlePkgPath.
        // A local copy of the separator keeps the closure from capturing `this`.
        val separator = dataSplit
        val bundleInfoMap = sc.textFile(bundlePkgPath)
          .map(line => StringUtils.splitPreserveAllTokens(line, separator, -1))
          // Blank or single-column lines cannot produce a key/value pair; skip them instead
          // of failing the whole job with an ArrayIndexOutOfBoundsException.
          .filter(_.length >= 2)
          .map(fields => (fields(0), fields(1)))
          .collectAsMap()
        val bundleBC = sc.broadcast(bundleInfoMap)

        val schema = getSchema(business)
        val rdd: RDD[(Text, OrcStruct)] =
          processData(business, date, input, output, unmatchInputPath, unmatchOutputPath, spark, bundleBC, coalesce)
            .rdd
            .mapPartitions(rows => new CustomIterator(schema, rows, output, unmatchOutputPath))

        rdd.saveAsNewAPIHadoopFile(output, classOf[Text], classOf[OrcStruct], classOf[MultipleOrcOutputFormat[Text, OrcStruct]],
          initConfig(business, sc.hadoopConfiguration))

        0
      } finally {
        // Stopping the session also stops its underlying SparkContext.
        spark.stop()
      }
    }
  }

  /**
    * Declares the command-line options read by [[run]]. Every option takes a value.
    *
    * @return the option set handed to the parser and to `printUsage`
    */
  override def buildOptions(): Options = {
    val options = new Options
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("date", true, "[must] date")
    options.addOption("business", true, "[must] business")
    options.addOption("input", true, "[must] input path")
    options.addOption("output", true, "[must] output path")
    options.addOption("bundlePkgPath", true, "[must] bundlePkgPath")
    options.addOption("unmatchInputPath", true, "[must] unmatchInputPath")
    options.addOption("unmatchOutputPath", true, "[must] unmatchOutputPath")
    options
  }

  /**
    * Business-specific transformation supplied by concrete jobs. [[run]] converts the returned
    * DataFrame to an RDD and feeds each partition to `CustomIterator` before saving it as ORC.
    *
    * @param business      value of the `business` option
    * @param date          value of the `date` option
    * @param input         value of the `input` option
    * @param output        value of the `output` option
    * @param oldUnmatch    value of the `unmatchInputPath` option
    * @param unMatchOutput value of the `unmatchOutputPath` option
    * @param spark         the session created by [[run]]
    * @param bundleBC      broadcast of the dictionary map loaded from `bundlePkgPath`
    * @param coalesce      value of the `coalesce` option, parsed as an integer
    * @return the rows to be written by [[run]]
    */
  def processData(business: String, date: String, input: String, output: String, oldUnmatch: String, unMatchOutput: String, spark: SparkSession,
                  bundleBC: Broadcast[scala.collection.Map[String, String]], coalesce: Int): DataFrame
}