package mobvista.dmp.main

import java.net.URI
import java.util

import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.cli.{BasicParser, CommandLine, HelpFormatter, Options}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

/**
  * Finds all unmatched bundle IDs.
  *
  * Reads tab-separated text lines from the input path, decides which module a file
  * belongs to from the name of the file's parent directory, extracts the package-name
  * column configured for that module and writes the distinct package names to the
  * output path as text.
  */
class FindUnmatchBundle extends Serializable {
  val dataSplit = "\t"

  // <module name, index of the package-name column>
  val config = new mutable.HashMap[String, Int]()
  config.put("3s", 3)
  config.put("adserver", 3)
  config.put("adn_install", 5)
  config.put("adn_request", 4)
  config.put("dsp", 10)
  config.put("ga", 3)

  /**
    * Parses the command line and runs the Spark job.
    *
    * @param args command line arguments; the `input` and `output` options are required
    * @return 1 when a required option is missing (usage is printed and no Spark
    *         context is created), 0 once the job has finished
    */
  def run (args : Array[String]) : Int = {
    val options = buildOptions()
    val commandLine = new BasicParser().parse(options, args)

    if (!checkMustOption(commandLine)) {
      printUsage(options)
      1
    } else {
      val input = commandLine.getOptionValue("input")
      val output = commandLine.getOptionValue("output")

      println("****************************************")
      println(s"* input = $input")
      println(s"* output $output")
      println("****************************************")

      val sc = new SparkContext(new SparkConf().setAppName("FindUnmatchBundle"))
      try {
        // newAPIHadoopFile is declared to return a plain RDD; the cast exposes
        // mapPartitionsWithInputSplit, which is needed to learn each split's file path.
        val hadoopRDD = sc
          .newAPIHadoopFile[LongWritable, Text, TextInputFormat](input)
          .asInstanceOf[NewHadoopRDD[LongWritable, Text]]

        val packageData = hadoopRDD.mapPartitionsWithInputSplit[String]((inputSplit, itr) => {
          // The parent directory name of the split's file identifies the module.
          val moduleName = inputSplit.asInstanceOf[FileSplit].getPath.getParent.getName
          val packages: Iterator[String] = config.get(moduleName) match {
            case Some(index) =>
              itr.flatMap(tuple => extractPackages(tuple._2.toString, index, moduleName))
            // Files of unknown modules contribute nothing. Spark requires a real
            // (possibly empty) iterator here, never null.
            case None =>
              Iterator.empty
          }
          packages
        })

        // Resolve the file system from the output path itself so that the delete
        // targets the same location saveAsTextFile writes to, whatever the bucket/scheme.
        val outputPath = new Path(output)
        outputPath.getFileSystem(sc.hadoopConfiguration).delete(outputPath, true)
        packageData.distinct().coalesce(5, shuffle = true).saveAsTextFile(output)
        0
      } finally {
        sc.stop()
      }
    }
  }

  /**
    * Extracts the package names contained in one log line.
    *
    * @param line       raw log line; columns are separated by [[dataSplit]]
    * @param index      zero-based index of the package-name column
    * @param moduleName module the line belongs to; for "dsp" the column is parsed as a
    *                   JSON array through GsonUtil, otherwise it is used as-is
    * @return the de-duplicated package names of the line; empty when the line has
    *         too few columns
    */
  private def extractPackages(line: String, index: Int, moduleName: String): Iterator[String] = {
    val columns = StringUtils.splitPreserveAllTokens(line, dataSplit, -1)
    if (columns.length <= index) {
      // Blank or truncated line: skip it instead of failing the whole job.
      Iterator.empty
    } else {
      val column = columns(index)
      val names = new util.HashSet[String]()
      if (!"dsp".equals(moduleName)) {
        names.add(column)
      } else {
        val jsonArray = GsonUtil.String2JsonArray(column)
        names.addAll(GsonUtil.fromJson(jsonArray, classOf[util.HashSet[String]]))
      }
      names.toArray(Array[String]()).iterator
    }
  }


  /**
    * Checks that every required option is present, printing a hint for the first
    * missing one.
    *
    * @param commands parsed command line
    * @return true when both `input` and `output` are set
    */
  def checkMustOption(commands: CommandLine): Boolean = {
    if (!commands.hasOption("input")) {
      println("please set input ")
      false
    } else if (!commands.hasOption("output")) {
      println("please set output ")
      false
    } else {
      true
    }
  }

  /**
    * Prints the command line usage for this job.
    *
    * @param options the options to describe
    */
  def printUsage(options: Options): Unit = {
    val help = new HelpFormatter
    help.printHelp(this.getClass.getSimpleName, options)
  }

  /**
    * Builds the command line options understood by the job.
    *
    * @return options `input` and `output`, each taking one argument
    */
  def buildOptions(): Options = {
    val options = new Options
    options.addOption("input", true , "[must] input path")
    options.addOption("output", true , "[must] output path")
    options
  }
}

/** Command line entry point: creates the job and runs it with the given arguments. */
object FindUnmatchBundle {
  def main(args: Array[String]): Unit = {
    val job = new FindUnmatchBundle
    // The status code returned by run is not used here.
    job.run(args)
  }
}
