package mobvista.dmp.main

import java.net.URI
import java.util

import mobvista.dmp.util.DelayUtil
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.cli.{BasicParser, CommandLine, HelpFormatter, Options}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

/**
  * Processes DSP data:
  * 1. joins previously unmatched data with the bundle_bundle_mapping data
  * 2. finds the records that are still unmatched
  * Created by fl on 2017/7/18.
  */
class DSPBundleMatchJob extends Serializable {
  // Column delimiter used to split every record into fields and to join the fields back together.
  val dataSplit = "\t"
  // Matches a string made only of digits; isBundleId uses it to exclude purely numeric package names.
  val regex = "^[0-9]+$"

  /**
    * Parses the command line, runs the bundle-matching Spark job and writes two outputs:
    * the matched data (plus the untouched android records) to `output`, and the records
    * whose package is still a bundle id to `unmatchOutputPath`.
    *
    * The SparkContext is always stopped before this method returns or throws.
    *
    * @param args command line arguments, see [[buildOptions]]
    * @return 1 when [[checkMustOption]] rejects the command line, 0 otherwise
    */
  def run(args: Array[String]): Int = {
    var sc: SparkContext = null

    try {
      val options = buildOptions()
      val commandLine = new BasicParser().parse(options, args)

      if (!checkMustOption(commandLine)) {
        printUsage(options)
        return 1
      }

      val input = commandLine.getOptionValue("input")
      val output = commandLine.getOptionValue("output")
      val pfIndex = commandLine.getOptionValue("pfIndex").toInt
      val pkgIndex = commandLine.getOptionValue("pkgIndex").toInt
      val bundlePkgPath = commandLine.getOptionValue("bundlePkgPath")
      val unmatchInputPath = commandLine.getOptionValue("unmatchInputPath")
      val unmatchOutputPath = commandLine.getOptionValue("unmatchOutputPath")
      val parallelism = commandLine.getOptionValue("parallelism").toInt
      val coalesce = commandLine.getOptionValue("coalesce").toInt

      println("*************************")
      println(s"* input = $input")
      println(s"* output = $output")
      println(s"* pfIndex = $pfIndex")
      println(s"* pkgIndex = $pkgIndex")
      println(s"* bundlePkgPath = $bundlePkgPath")
      println(s"* unmatchInputPath = $unmatchInputPath")
      println(s"* unmatchOutputPath = $unmatchOutputPath")
      println("*************************")

      val conf = new SparkConf()
        .setAppName("BundleMatchJob")
        .set("spark.rdd.compress", "true")
        .set("spark.io.compression.codec", "snappy")
        .set("spark.default.parallelism", s"$parallelism")
        .set("spark.sql.orc.filterPushdown", "true")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      sc = new SparkContext(conf)

      // Dictionary: the first two columns of every mapping line, collected on the driver
      // and broadcast to the executors.
      val bundleInfoMap = sc.textFile(bundlePkgPath).map(record => {
        val splits = StringUtils.splitPreserveAllTokens(record, dataSplit, -1)
        (splits(0), splits(1)) // (bundleID, iospackage)
      }).collectAsMap()
      val bundleBC = sc.broadcast(bundleInfoMap)

      // Re-run the matching on previously unmatched data (when a path is given): records that
      // can be matched now are added to today's result, the rest stay in the unmatched output.
      val rematchedData: Option[RDD[String]] =
        if (StringUtils.isNotEmpty(unmatchInputPath)) {
          Some(sc.textFile(unmatchInputPath).flatMap(matchPackage(_, pfIndex, pkgIndex, bundleBC)))
        } else {
          None
        }
      val oldValidData = rematchedData.map(_.filter(filterMatchData(_, pfIndex, pkgIndex, false)))
      val oldUnmatchData = rematchedData.map(_.filter(filterMatchData(_, pfIndex, pkgIndex, true)))

      val dayData = sc.textFile(input)
      val androidData = dayData.filter(isPlatformData(_, "android", pfIndex))
      val iosData = dayData.filter(isPlatformData(_, "ios", pfIndex))
      val dayMatchData = iosData.flatMap(matchPackage(_, pfIndex, pkgIndex, bundleBC))

      // Records whose package is not a bundle id.
      val dayValidData = dayMatchData.filter(filterMatchData(_, pfIndex, pkgIndex, false))

      // Records whose package is still a bundle id.
      val dayUnMatchData = dayMatchData.filter(filterMatchData(_, pfIndex, pkgIndex, true))

      // Write the day's data.
      val iosValidRDD = oldValidData match {
        case Some(old) => dayValidData.union(old)
        case None => dayValidData
      }
      // Resolve the file system from the path itself instead of a hard-coded bucket, so the
      // delete always targets the same file system saveAsTextFile writes to.
      val outputDir = new Path(output)
      outputDir.getFileSystem(sc.hadoopConfiguration).delete(outputDir, true)
      combinePackage(iosValidRDD, pfIndex, pkgIndex).union(androidData).saveAsTextFile(output, classOf[GzipCodec])

      // Write the unmatched data.
      val unmatchOutputDir = new Path(unmatchOutputPath)
      unmatchOutputDir.getFileSystem(sc.hadoopConfiguration).delete(unmatchOutputDir, true)
      val iosUnmatchRDD = oldUnmatchData match {
        case Some(old) => dayUnMatchData.union(old)
        case None => dayUnMatchData
      }
      val resultRDD = combinePackage(iosUnmatchRDD, pfIndex, pkgIndex)
      // Nothing is written when there is no unmatched record at all.
      if (!resultRDD.isEmpty()) {
        resultRDD.coalesce(coalesce, true).saveAsTextFile(unmatchOutputPath, classOf[GzipCodec])
      }
    } finally {
      if (sc != null) {
        sc.stop()
      }
    }
    0
  }

  /**
    * Groups records by their first two columns (device_id and device_type) and merges the
    * package column of every group into one JSON array, e.g. ['pkg1', 'pkg2'].
    *
    * All other columns of the merged record come from the first record seen for the key.
    *
    * @param iosData  records whose package column holds a single package name
    * @param pfIndex  index of the platform column (not used by this method)
    * @param pkgIndex index of the package column
    * @return one record per key, with the package column replaced by the merged JSON array
    */
  def combinePackage(iosData: RDD[String], pfIndex: Int, pkgIndex: Int): RDD[String] = {
    // Folds `incoming` into `accumulated`. `incomingPackages` turns the raw package column of
    // `incoming` into the package names to add. An empty `accumulated` is the aggregation zero.
    def mergeRecord(accumulated: String, incoming: String)
                   (incomingPackages: String => util.Collection[String]): String = {
      val packages = new util.HashSet[String]()
      val accumulatedFields =
        if (StringUtils.isNotEmpty(accumulated)) {
          val fields = StringUtils.splitPreserveAllTokens(accumulated, dataSplit, -1)
          val known = GsonUtil.String2JsonArray(fields(pkgIndex))
          packages.addAll(GsonUtil.fromJson(known, classOf[util.HashSet[String]]))
          Some(fields)
        } else {
          None
        }

      val incomingFields = StringUtils.splitPreserveAllTokens(incoming, dataSplit, -1)
      packages.addAll(incomingPackages(incomingFields(pkgIndex)))

      // Keep the columns of the accumulated record when there is one.
      val merged = accumulatedFields.getOrElse(incomingFields)
      merged(pkgIndex) = GsonUtil.toJson(packages)
      merged.mkString(dataSplit)
    }

    val keyedRecords = iosData.map(record => {
      val fields = StringUtils.splitPreserveAllTokens(record, dataSplit, -1)
      (s"${fields(0)}$dataSplit${fields(1)}", record)
    })

    keyedRecords.aggregateByKey("")(
      // A raw record carries a single package name in its package column.
      (acc: String, record: String) =>
        mergeRecord(acc, record)(name => util.Collections.singleton(name)),
      // A partial result already carries a JSON array in its package column.
      (left: String, right: String) =>
        mergeRecord(left, right)(raw =>
          GsonUtil.fromJson(GsonUtil.String2JsonArray(raw), classOf[util.HashSet[String]]))
    ).map(_._2)
  }


  /**
    * Tells whether the platform column of a record equals the given platform.
    *
    * @param record   one tab separated record
    * @param platform expected platform value
    * @param pfIndex  index of the platform column
    * @return false when the record has no column at `pfIndex`, otherwise the result of the comparison
    */
  def isPlatformData(record: String, platform: String, pfIndex: Int): Boolean = {
    val fields = StringUtils.splitPreserveAllTokens(record, dataSplit, -1)
    fields.length > pfIndex && platform.equals(fields(pfIndex))
  }

  /**
    * Selects either the bundle records or the non-bundle records.
    *
    * @param record   one tab separated record
    * @param pfIndex  index of the platform column
    * @param pkgIndex index of the package column
    * @param flag     true keeps records whose package is a bundle id, false keeps the others
    * @return whether the record is kept; always false when the record lacks one of the two columns
    */
  def filterMatchData(record: String, pfIndex: Int, pkgIndex: Int, flag: Boolean): Boolean = {
    val fields = StringUtils.splitPreserveAllTokens(record, dataSplit, -1)
    val hasBothColumns = pfIndex < fields.length && pkgIndex < fields.length
    // Keep the record only when its bundle status is the requested one.
    hasBothColumns && isBundleId(fields(pfIndex), fields(pkgIndex)) == flag
  }

  /**
    * Decides whether a package name is a bundle id.
    *
    * A name counts as a bundle id when the platform is "ios", the name does not start
    * with "id" and the name is not made of digits only.
    *
    * @param platform    platform of the record
    * @param packageName package name to check
    * @return true when the package name is a bundle id
    */
  def isBundleId(platform: String, packageName: String): Boolean = {
    // The platform is tested first, so the package name is only inspected for ios records.
    "ios".equals(platform) && !(packageName.startsWith("id") || packageName.matches(regex))
  }

  /**
    * Splits one record into several records, one per entry of the JSON package list found in
    * the package column. A package name that is a bundle id (see isBundleId) is replaced by
    * its mapped value from the broadcast dictionary when one exists; every other name is
    * written unchanged.
    *
    * @param record   one tab separated record whose package column is a JSON array
    * @param pfIndex  index of the platform column
    * @param pkgIndex index of the package column
    * @param bundleBC broadcast dictionary used to translate bundle ids
    * @return the expanded records; empty when the record lacks one of the two columns
    */
  def matchPackage(record: String, pfIndex: Int, pkgIndex: Int, bundleBC: Broadcast[scala.collection.Map[String, String]]): Array[String] = {
    val fields = StringUtils.splitPreserveAllTokens(record, dataSplit, -1)
    if (pfIndex >= fields.length || pkgIndex >= fields.length) {
      Array.empty[String]
    } else {
      val platform = fields(pfIndex)
      val packageList = GsonUtil.fromJson(GsonUtil.String2JsonArray(fields(pkgIndex)), classOf[java.util.List[String]])
      val expanded = for (packageName <- packageList) yield {
        val resolved =
          if (isBundleId(platform, packageName)) {
            // Unknown bundle ids are kept as they are.
            bundleBC.value.getOrElse(packageName, packageName)
          } else {
            packageName
          }
        fields.updated(pkgIndex, resolved).mkString(dataSplit)
      }
      expanded.toArray
    }
  }

  /**
    * Checks that the "input" and "output" options are present, in that order,
    * and prints a hint for the first one that is missing.
    *
    * @param commands parsed command line
    * @return true when both options are present
    */
  def checkMustOption(commands: CommandLine): Boolean = {
    val firstMissing = Seq("input", "output").find(name => !commands.hasOption(name))
    firstMissing.foreach(name => println(s"please set $name "))
    firstMissing.isEmpty
  }

  /**
    * Prints the usage text for the given options, using the simple class name as the command name.
    *
    * @param options options to describe
    */
  def printUsage(options: Options): Unit = {
    val formatter = new HelpFormatter
    val commandName = this.getClass.getSimpleName
    formatter.printHelp(commandName, options)
  }

  /**
    * Declares every command line option read by `run`. Each option takes one argument.
    *
    * The descriptions follow how `run` uses the values: `unmatchInputPath` is skipped when
    * empty, while `parallelism` and `coalesce` are converted to Int unconditionally and
    * therefore have to be supplied.
    *
    * @return the option definitions
    */
  def buildOptions(): Options = {
    val options = new Options
    options.addOption("input", true, "[must] input path")
    options.addOption("output", true, "[must] output path")
    options.addOption("pfIndex", true, "[must] index of platform column")
    options.addOption("pkgIndex", true, "[must] index of package column")
    options.addOption("bundlePkgPath", true, "[must] path of the bundle id to package mapping")
    options.addOption("unmatchInputPath", true, "[optional] path of previously unmatched data")
    options.addOption("unmatchOutputPath", true, "[must] output path of unmatched data")
    options.addOption("parallelism", true, "[must] parallelism of shuffle operation")
    options.addOption("coalesce", true, "[must] number of output files of unmatched data")
    options
  }
}

object DSPBundleMatchJob {
  /**
    * Entry point: runs the job between the start and end messages of DelayUtil and
    * terminates the JVM with the job's status code when that code is not 0, so a rejected
    * command line is no longer reported as a successful run.
    *
    * @param args command line arguments forwarded to `DSPBundleMatchJob.run`
    */
  def main(args: Array[String]): Unit = {
    val du = new DelayUtil()
    val exitCode =
      try {
        du.setStart(System.currentTimeMillis())
        du.printStart("Start DSPBundleMatchJob")
        new DSPBundleMatchJob().run(args)
      } finally {
        du.setEnd(System.currentTimeMillis())
        du.printEnd("Finish DSPBundleMatchJob")
      }
    // Exit only after the finally block has run: sys.exit does not execute pending finally blocks.
    if (exitCode != 0) {
      sys.exit(exitCode)
    }
  }
}
