package mobvista.dmp.datasource.apptag

import java.net.URI
import java.util.{Date, Random}
import java.util.regex.Pattern

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.datasource.apptag.crawler_spark.{CaiNiXiHuanAdrDetailVisitor, CaiNiXiHuanIosDetailVisitor, BundleVisitor}
import mobvista.dmp.format.TextMultipleOutputFormat
import mobvista.dmp.util.MRUtils
import org.apache.commons.cli.Options
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.io.Text
import org.apache.http.HttpHost
import org.apache.spark.HashPartitioner
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.collection.mutable.ArrayBuffer


class CaiNiXiHuanCrawlerSpark extends CommonSparkJob with Serializable {

  // Package names routed to the "adr"/"bundle" branch of fixPlatform:
  // one or more non-dot characters, a dot, then anything.
  val adrPkgPtn: Pattern = Pattern.compile("^[^.]+\\..*$")
  // Identifiers classified as "ios" by fixPlatform: eight or more digits and nothing else.
  val iosPkgPtn: Pattern = Pattern.compile("^\\d{8,}$")
  // Identifiers starting with "id" followed by at least one word character (used by parseRow).
  val fixPkgPtn: Pattern = Pattern.compile("^id\\w+")
  // Random source used by nextRandom; seeded from the wall clock when the job object is built.
  val random: Random = new Random(System.currentTimeMillis())

  /**
   * Parses the command line, loads the proxy list, normalises the input rows with
   * parseRow and crawls the Android and iOS rows, writing the results as text files.
   *
   * Outputs:
   *  - `adroutput`: the crawled payload of every Android row with a non-blank result.
   *  - `iosoutput/success`: the first `\004`-separated field of every iOS result whose
   *    second field equals "0".
   *  - `iosoutput/failed`: "row TAB code" for every iOS result whose second field is not "0".
   *
   * `bundleoutput` is still cleared, although the bundle crawl itself is disabled (see the
   * commented-out statement at the end of the try block).
   *
   * @param args raw command-line arguments; the options are the ones declared in buildOptions
   * @return 1 when checkMustOption rejects the command line, 0 once the crawl has finished
   */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      return 1
    }
    printOptions(commandLine)

    val input_path = commandLine.getOptionValue("input")
    val coalesce = commandLine.getOptionValue("coalesce").toInt
    val iosoutput = commandLine.getOptionValue("iosoutput")
    val adroutput = commandLine.getOptionValue("adroutput")
    val bundleoutput = commandLine.getOptionValue("bundleoutput")
    val proxy_path = commandLine.getOptionValue("proxy_path")

    val spark = SparkSession
      .builder()
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "lz4")
      .config("spark.io.compression.lz4.blockSize", "64k")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.sql.autoBroadcastJoinThreshold", "209715200")
      .config("spark.sql.broadcastTimeout", "1200")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    try {
      val sc = spark.sparkContext

      // Clear the previous outputs inside the try block so that the session is still
      // stopped by the finally clause when a delete fails.
      val fs = FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration)
      Seq(iosoutput, adroutput, bundleoutput).foreach(out => fs.delete(new Path(out), true))

      // One "host:port" entry per line; blank lines are skipped instead of failing the job.
      val proxy = sc.textFile(proxy_path)
        .map(_.trim)
        .filter(_.nonEmpty)
        .map { line =>
          val hostPort = line.split(":")
          new HttpHost(hostPort(0).trim, hostPort(1).trim.toInt)
        }
        .collect()
      val proxyMap = sc.broadcast(proxy)

      // Rows are normalised exactly once here; parseRow returns "" for rows to be dropped.
      val originRdd = sc.textFile(input_path)
        .filter(_.split("\t").length == 2)
        .map(line => parseRow(line))
        .filter(_.length > 0)
      originRdd.persist()

      val adrRows = originRdd.filter { row =>
        val platform = row.split("\t")(1)
        platform == "adr" || platform == "android"
      }
      crawlRows(adrRows, coalesce, proxyMap).map(_._2).saveAsTextFile(adroutput)

      val iosRes = crawlRows(originRdd.filter(_.split("\t")(1) == "ios"), coalesce, proxyMap)
      iosRes.persist()
      // An iOS result is "payload \004 code"; code "0" marks a successful crawl.
      iosRes.filter(a => a._2.split('\004')(1) == "0")
        .map(_._2.split('\004')(0))
        .saveAsTextFile(s"${iosoutput}/success")
      iosRes.filter(a => a._2.split('\004')(1) != "0")
        .map { a =>
          val errorCode = a._2.split('\004')(1)
          s"${a._1}\t${errorCode}"
        }
        .saveAsTextFile(s"${iosoutput}/failed")

      //originRdd.filter(_.split("\t")(1)=="bundle").map(a=>{buildResult(a,proxyMap.value)}).filter(a=>{StringUtils.isNotBlank(a)}).saveAsTextFile(bundleoutput)
    } finally {
      spark.stop()
    }
    0
  }

  /**
   * Spreads the rows over `partitions` partitions (keyed by their zipWithIndex position)
   * and crawls each one with buildResult, keeping only rows with a non-blank result.
   *
   * @return pairs of (input row, crawl result)
   */
  private def crawlRows(rows: org.apache.spark.rdd.RDD[String],
                        partitions: Int,
                        proxies: Broadcast[Array[HttpHost]]): org.apache.spark.rdd.RDD[(String, String)] = {
    rows.zipWithIndex()
      .map { case (row, idx) => (idx, row) }
      .partitionBy(new HashPartitioner(partitions))
      .map { case (_, row) => buildResult(row, proxies.value) }
      .filter { case (_, body) => StringUtils.isNotBlank(body) }
  }


  /**
   * Crawls one "pkg TAB platform" row through a proxy picked by nextRandom.
   *
   * The platform field (compared case-insensitively) selects the visitor: "ios",
   * "adr"/"android" or "bundle".
   *
   * @param pkgPlatform the row to crawl
   * @param proxyMap    candidate proxies
   * @return the input row paired with the visitor's result; the second element is null
   *         when the row is blank or the platform is not one of the values above
   */
  def buildResult(pkgPlatform:String, proxyMap:Array[HttpHost]): (String, String) = {
    if (StringUtils.isBlank(pkgPlatform)) {
      (pkgPlatform, null)
    } else {
      val proxyHost = nextRandom(proxyMap)
      val fields = pkgPlatform.split("\t")
      val pkg = fields(0)
      val platform = fields(1)
      val crawled: String = platform match {
        case p if "ios".equalsIgnoreCase(p) =>
          new CaiNiXiHuanIosDetailVisitor(proxyHost, "").visit(pkg)
        case p if "adr".equalsIgnoreCase(p) || "android".equalsIgnoreCase(p) =>
          new CaiNiXiHuanAdrDetailVisitor(proxyHost, "").visit(pkg)
        case p if "bundle".equalsIgnoreCase(p) =>
          new BundleVisitor(proxyHost, "").visit(pkg)
        case _ =>
          null
      }
      (pkgPlatform, crawled)
    }
  }

  /**
   * Writes the output-compression settings (Gzip codec, BLOCK compression type) into
   * the given Hadoop configuration.
   *
   * @param conf configuration to update in place
   * @return the same `conf` instance
   */
  def initConfig(conf: Configuration): Configuration = {
    val blockCompression = org.apache.hadoop.io.SequenceFile.CompressionType.BLOCK.toString
    Seq(
      "mapreduce.output.compress" -> "true",
      "mapreduce.output.compression.codec" -> "org.apache.hadoop.io.compress.GzipCodec"
    ).foreach { case (key, value) => conf.set(key, value) }
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.type", blockCompression)
    conf
  }


  /**
   * Declares the command-line options read by run. Every option takes a value.
   *
   * @return a fresh Options instance holding the six options
   */
  override protected def buildOptions(): Options = {
    // (option name, description), registered in this order.
    val declared = Seq(
      "input" -> "[must] input",
      "coalesce" -> "[must] coalesce",
      "iosoutput" -> "[must] iosoutput path",
      "bundleoutput" -> "[must] bundleoutput path",
      "adroutput" -> "[must] adroutput path",
      "proxy_path" -> "[must] set proxy_path"
    )
    declared.foldLeft(new Options) { case (opts, (name, description)) =>
      opts.addOption(name, true, description)
    }
  }


  /**
   * Normalises one "pkg TAB platform" input line.
   *
   * A leading "id" is removed only when the rest of the identifier is a numeric iOS id
   * (matches iosPkgPtn). Stripping on fixPkgPtn alone would also mangle dotted package
   * names that merely start with "id" plus a word character, e.g. "idv.foo.bar" would
   * become "v.foo.bar".
   *
   * @param line raw input line
   * @return "pkg TAB platform" (joined with MRUtils.JOINER) using the platform chosen by
   *         fixPlatform, or "" when the line has fewer than two fields, the package is
   *         blank, or fixPlatform yields no platform
   */
  def parseRow(line: String): String = {
    val fields = line.split("\t")
    if (fields.length < 2) {
      // Malformed row: nothing to normalise.
      ""
    } else {
      val rawPkg = fields(0) // pkg
      val orgPlatform = fields(1) // platform
      val hasNumericIdPrefix =
        fixPkgPtn.matcher(rawPkg).find() && iosPkgPtn.matcher(rawPkg.substring(2)).matches()
      val pkg = if (hasNumericIdPrefix) rawPkg.substring(2) else rawPkg

      if (StringUtils.isBlank(pkg)) {
        ""
      } else {
        val platform = fixPlatform(pkg, orgPlatform)
        // Only packages for which a platform could be determined qualify for processing.
        if (StringUtils.isNotBlank(platform)) MRUtils.JOINER.join(pkg, platform) else ""
      }
    }
  }

  /**
   * Chooses the platform label for a package from the shape of its identifier and the
   * platform reported in the input.
   *
   *  - identifier matches adrPkgPtn and the reported platform is "adr"/"android": "adr"
   *  - identifier matches adrPkgPtn and the reported platform is "ios": "bundle"
   *  - identifier matches iosPkgPtn: "ios"
   *
   * The reported platform is compared case-insensitively using Locale.ROOT, so the result
   * does not depend on the JVM default locale (under a Turkish locale "IOS".toLowerCase()
   * does not equal "ios").
   *
   * @param pkg         package identifier
   * @param orgPlatform platform string from the input row
   * @return "adr", "bundle" or "ios"; null when no rule applies (callers test the result
   *         with StringUtils.isNotBlank)
   */
  def fixPlatform(pkg: String, orgPlatform: String): String = {
    if (adrPkgPtn.matcher(pkg).matches()) {
      orgPlatform.toLowerCase(java.util.Locale.ROOT) match {
        case "adr" | "android" => "adr"
        case "ios" => "bundle"
        case _ => null
      }
    } else if (iosPkgPtn.matcher(pkg).matches()) {
      "ios"
    } else {
      null
    }
  }

  /**
   * Picks one proxy from the array using the job's shared random source.
   *
   * @param proxyHosts candidate proxies
   * @return the element at index (random int) floor-mod (array length)
   */
  def nextRandom(proxyHosts: Array[HttpHost]): HttpHost =
    // floorMod maps the possibly negative random int into 0 until length.
    proxyHosts(java.lang.Math.floorMod(random.nextInt(), proxyHosts.length))
}


object CaiNiXiHuanCrawlerSpark {
  /** Command-line entry point: creates the job and passes it the raw arguments. */
  def main(args: Array[String]): Unit = {
    val job = new CaiNiXiHuanCrawlerSpark()
    job.run(args)
    //    new AppInfoCrawlerSpark().testrun(args)
  }
}