package mobvista.dmp.datasource.apptag

import java.net.URI
import java.util.Random
import java.util.regex.Pattern

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.datasource.apptag.crawler_spark.{AdrDetailVisitor, BundleVisitor, IosDetailVisitor}
import mobvista.dmp.format.TextMultipleOutputFormat
import mobvista.dmp.util.MRUtils
import org.apache.commons.cli.Options
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.Text
import org.apache.http.HttpHost
import org.apache.spark.HashPartitioner
import org.apache.spark.sql.SparkSession


/**
  * Spark job that reads `<package>\t<platform>` lines, normalises them, crawls the
  * detail information of every retained package through a randomly chosen HTTP proxy
  * and writes the crawl results with [[TextMultipleOutputFormat]].
  *
  * Every record is written as a `(key, value)` pair of Hadoop `Text`, where the key is
  * the target output path followed by `", "` and the value is the visitor's result.
  * NOTE(review): the key presumably lets `TextMultipleOutputFormat` route each record
  * to the ios / adr / bundle output directory — confirm against that class.
  */
class AppInfoCrawlerSpark extends CommonSparkJob with Serializable {

  /** Matches package names of the form `<word chars>.<anything>`, e.g. `com.foo.bar`. */
  val adrPkgPtn: Pattern = Pattern.compile("^\\w+\\..*$")

  /** Matches ids that consist solely of eight or more digits. */
  val iosPkgPtn: Pattern = Pattern.compile("^\\d{8,}$")

  /** Matches ids carrying a leading `id` prefix, which `parseRow` strips. */
  val fixPkgPtn: Pattern = Pattern.compile("^id\\w+")

  /**
    * Random source used to pick a proxy.
    *
    * This instance is captured by the task closures (they call `parseRow` and
    * `buildResult`). A plain `val` would be serialized together with its internal
    * state, so every task would start from the same state and replay the same proxy
    * sequence. Being transient and lazy, each deserialized copy seeds its own generator.
    */
  @transient lazy val random: Random = new Random()

  /**
    * Entry point of the job.
    *
    * @param args command line arguments, see [[buildOptions]]
    * @return `1` when a mandatory option is missing, `0` after a successful run
    */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      1
    } else {
      printOptions(commandLine)
      crawlAndSave(
        inputPath = commandLine.getOptionValue("input"),
        today = commandLine.getOptionValue("today"),
        // Parsed up front so that a malformed value fails before any Spark work starts.
        partitions = commandLine.getOptionValue("coalesce").toInt,
        iosOutput = commandLine.getOptionValue("iosoutput"),
        adrOutput = commandLine.getOptionValue("adroutput"),
        bundleOutput = commandLine.getOptionValue("bundleoutput")
      )
      0
    }
  }

  /**
    * Creates the Spark session, clears the three output directories, loads the proxy
    * list and runs the crawl. The session is always stopped, also when clearing the
    * outputs or reading the proxy list fails.
    */
  private def crawlAndSave(inputPath: String,
                           today: String,
                           partitions: Int,
                           iosOutput: String,
                           adrOutput: String,
                           bundleOutput: String): Unit = {
    val spark = SparkSession
      .builder()
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "lz4")
      .config("spark.io.compression.lz4.blockSize", "64k")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.sql.autoBroadcastJoinThreshold", "209715200")
      .config("spark.sql.broadcastTimeout", "1200")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    try {
      val sc = spark.sparkContext
      val hadoopConf = sc.hadoopConfiguration

      // Remove the results of a previous run.
      val fs = FileSystem.get(new URI("s3://mob-emr-test"), hadoopConf)
      Seq(iosOutput, adrOutput, bundleOutput).foreach(output => fs.delete(new Path(output), true))

      // One `host:port` entry per line; blank lines are ignored, anything else that is
      // malformed still fails loudly because it indicates a broken proxy file.
      val proxies = sc.textFile("s3://mob-emr-test/andy/dmp/proxy")
        .map(_.trim)
        .filter(_.nonEmpty)
        .map { line =>
          val parts = line.split(":")
          new HttpHost(parts(0), parts(1).trim.toInt)
        }
        .collect()
      // Fail on the driver instead of once per record inside the tasks.
      require(proxies.nonEmpty, "proxy list s3://mob-emr-test/andy/dmp/proxy is empty")
      val proxyMap = sc.broadcast(proxies)

      sc.textFile(inputPath)
        .filter(_.split("\t").length == 2)
        .map(line => parseRow(line))
        .filter(_.length > 0)
        .zipWithIndex()
        // NOTE(review): only a ~10% sample of the parsed rows is crawled; confirm this
        // is intentional and not a leftover from testing.
        .sample(withReplacement = false, 0.1)
        // Key by the row index so that the hash partitioner spreads rows evenly.
        .map { case (row, index) => (index, row) }
        .partitionBy(new HashPartitioner(partitions))
        .map { case (_, row) =>
          buildResult(row, proxyMap.value, today, iosOutput, adrOutput, bundleOutput)
        }
        .filter(_ != null)
        .saveAsNewAPIHadoopFile(iosOutput, classOf[Text], classOf[Text],
          classOf[TextMultipleOutputFormat], hadoopConf)
    } finally {
      spark.stop()
    }
  }

  /**
    * Crawls one `<package>\t<platform>` row and wraps the outcome in Hadoop `Text`.
    *
    * @param pkgPlatform  row as produced by [[parseRow]]
    * @param proxyMap     proxies to choose from, must not be empty
    * @param today        date string handed to the visitor
    * @param iosoutput    output path used in the key of `ios` rows
    * @param adroutput    output path used in the key of `adr` rows
    * @param bundleoutput output path used in the key of `bundle` rows
    * @return `(outputPath + ", ", result)`, or `null` when the row is blank or
    *         malformed, the platform is unknown or the visitor returned a blank result
    */
  def buildResult(pkgPlatform: String, proxyMap: Array[HttpHost], today: String, iosoutput: String, adroutput: String, bundleoutput: String): (Text, Text) =
    crawlRow(pkgPlatform, proxyMap, today, iosoutput, adroutput, bundleoutput)
      .map { case (key, value) => (new Text(key), new Text(value)) }
      .orNull // kept for callers that filter on null

  /**
    * Same as [[buildResult]] but with plain strings.
    *
    * @return `(outputPath + ", ", result)`, or `("", "")` when nothing was crawled
    */
  def buildResultRow(pkgPlatform: String, proxyMap: Array[HttpHost], today: String, iosoutput: String, adroutput: String, bundleoutput: String): Tuple2[String, String] =
    crawlRow(pkgPlatform, proxyMap, today, iosoutput, adroutput, bundleoutput)
      .getOrElse(("", ""))

  /**
    * Shared implementation of [[buildResult]] and [[buildResultRow]]: picks a proxy,
    * delegates to the visitor of the row's platform and pairs a non-blank result with
    * the key of the matching output path.
    */
  private def crawlRow(pkgPlatform: String,
                       proxyMap: Array[HttpHost],
                       today: String,
                       iosoutput: String,
                       adroutput: String,
                       bundleoutput: String): Option[(String, String)] = {
    if (StringUtils.isBlank(pkgPlatform)) {
      None
    } else {
      pkgPlatform.split("\t") match {
        case Array(pkg, platform, _*) =>
          val httpproxy = nextRandom(proxyMap)
          val crawled: Option[(String, String)] =
            if ("ios".equalsIgnoreCase(platform)) {
              Some((iosoutput, new IosDetailVisitor(httpproxy, today).visit(pkg)))
            } else if ("adr".equalsIgnoreCase(platform)) {
              Some((adroutput, new AdrDetailVisitor(httpproxy, today).visit(pkg)))
            } else if ("bundle".equalsIgnoreCase(platform)) {
              Some((bundleoutput, new BundleVisitor(httpproxy, today).visit(pkg)))
            } else {
              None
            }
          crawled.collect {
            case (output, result) if StringUtils.isNotBlank(result) => (s"$output, ", result)
          }
        // A row without a platform column cannot be crawled; skip it instead of
        // failing the whole task.
        case _ => None
      }
    }
  }

  /**
    * Enables gzip, block-compressed output on the given configuration.
    *
    * NOTE(review): `mapreduce.output.compress` and `mapreduce.output.compression.codec`
    * do not look like the standard Hadoop 2 key names
    * (`mapreduce.output.fileoutputformat.compress[.codec]`) — confirm which keys the
    * output format actually reads. This method is not called by `run` in this file.
    *
    * @param conf configuration to modify in place
    * @return the same `conf` instance
    */
  def initConfig(conf: Configuration): Configuration = {
    import org.apache.hadoop.io.SequenceFile
    conf.set("mapreduce.output.compress", "true")
    conf.set("mapreduce.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.type", SequenceFile.CompressionType.BLOCK.toString)
    conf
  }

  /** Declares the command line options of the job; each of them takes a value. */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("input", true, "[must] input")
    options.addOption("today", true, "[must] today")
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("iosoutput", true, "[must] iosoutput path")
    options.addOption("bundleoutput", true, "[must] bundleoutput path")
    options.addOption("adroutput", true, "[must] adroutput path")
    options
  }

  /**
    * Normalises one `<package>\t<platform>` input line.
    *
    * A leading `id` prefix is stripped from the package and the platform is corrected
    * with [[fixPlatform]].
    *
    * @param line raw input line
    * @return the package and the corrected platform joined with `MRUtils.JOINER`, or
    *         `""` when the line has fewer than two fields, the package is blank or no
    *         platform could be determined
    */
  def parseRow(line: String): String =
    line.split("\t") match {
      case Array(rawPkg, orgPlatform, _*) =>
        val pkg = if (fixPkgPtn.matcher(rawPkg).find()) rawPkg.substring(2) else rawPkg
        Option(pkg)
          .filter(p => StringUtils.isNotBlank(p))
          // Only packages for which a platform can be determined qualify for crawling.
          .flatMap(p => Option(fixPlatform(p, orgPlatform)).filter(f => StringUtils.isNotBlank(f)))
          .map(platform => MRUtils.JOINER.join(pkg, platform))
          .getOrElse("")
      case _ => ""
    }

  /**
    * Derives the platform to crawl from the shape of the package name.
    *
    * @param pkg         package name or id
    * @param orgPlatform platform reported by the input, compared case-insensitively
    * @return `"adr"` for a dotted package reported as `adr`/`android`, `"bundle"` for a
    *         dotted package reported as `ios`, `"ios"` for a numeric id of at least
    *         eight digits, otherwise `null` (kept for existing callers)
    */
  def fixPlatform(pkg: String, orgPlatform: String): String = {
    if (adrPkgPtn.matcher(pkg).matches()) {
      Option(orgPlatform).map(_.toLowerCase()) match {
        case Some("adr") | Some("android") => "adr"
        case Some("ios") => "bundle"
        case _ => null
      }
    } else if (iosPkgPtn.matcher(pkg).matches()) {
      "ios"
    } else {
      null
    }
  }

  /**
    * Picks a uniformly distributed random proxy.
    *
    * @param proxyHosts candidates, must contain at least one element
    * @return one element of `proxyHosts`
    * @throws IllegalArgumentException when `proxyHosts` is null or empty
    */
  def nextRandom(proxyHosts: Array[HttpHost]): HttpHost = {
    require(proxyHosts != null && proxyHosts.nonEmpty, "proxyHosts must contain at least one proxy")
    proxyHosts(random.nextInt(proxyHosts.length))
  }
}


/** Command line launcher for the [[AppInfoCrawlerSpark]] job. */
object AppInfoCrawlerSpark {

  /**
    * Builds a fresh job instance and runs it with the given arguments.
    * The status code returned by `run` is discarded.
    *
    * @param args command line arguments forwarded to the job
    */
  def main(args: Array[String]): Unit = {
    val job = new AppInfoCrawlerSpark()
    job.run(args)
  }
}