package mobvista.dmp.datasource.apptag

import java.net.URI

import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
import org.apache.commons.cli.Options
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.SparkSession

import scala.collection.JavaConverters._
import scala.collection.mutable


/**
  * Spark job that exports the distinct `org.chromium.webapk.*` package names found in
  * `dwh.dmp_install_list` for one day, excluding every package present in
  * `dev.dm_package_black_list` for the same day.
  *
  * The result is written to `pkginstallpath` as gzip-compressed text, one
  * `package_name<TAB>platform` record per line.
  *
  * The class is `Serializable` because the UDFs registered in [[run]] are method references on
  * this instance, so the instance travels with them when Spark serializes the functions.
  */
class CrawPkgsSpark extends CommonSparkJob with Serializable {

  /**
    * Declares the command-line options of the job. Every option takes a value:
    *  - `pkginstallpath`: output directory (replaced on every run)
    *  - `coalesce`: number of output partitions
    *  - `yesday`: value of the `dt` partition to read
    */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("pkginstallpath", true, "[must] pkginstallpath")
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("yesday", true, "[must] yesday")
    options
  }

  /**
    * Parses the arguments, runs the export and stops the Spark session.
    *
    * @param args raw command-line arguments
    * @return `0` on success, `-1` when `checkMustOption` rejects the arguments
    * @throws NumberFormatException    if `coalesce` is not an integer
    * @throws IllegalArgumentException if `coalesce` is not positive
    */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      -1
    } else {
      printOptions(commandLine)

      val pkginstallpath = commandLine.getOptionValue("pkginstallpath")
      val yesday = commandLine.getOptionValue("yesday")
      // Validate the partition count up front: a bad value must fail the job before the
      // previous output is deleted and before a Spark session is started.
      val numPartitions = commandLine.getOptionValue("coalesce").toInt
      require(numPartitions > 0, s"coalesce must be a positive integer, got $numPartitions")

      val spark = SparkSession.builder()
        .appName("CrawPkgsSpark")
        .config("spark.rdd.compress", "true")
        .config("spark.io.compression.codec", "snappy")
        .config("spark.sql.orc.filterPushdown", "true")
        .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .enableHiveSupport()
        .getOrCreate()

      try {
        // Resolve the file system from the output path itself, so the directory that gets
        // removed is always the one saveAsTextFile writes to (any bucket or scheme).
        // The delete lives inside the try so the session is stopped even if it fails.
        val outputPath = new Path(pkginstallpath)
        outputPath.getFileSystem(spark.sparkContext.hadoopConfiguration).delete(outputPath, true)

        /*
        Legacy query against dwh.dm_install_list_v2, kept for reference only:

        val sql1 =
          s"""
            select b.package_name, b.platform
             |    from (
             |       select t.package_name
             |       from dev.dm_package_black_list t
             |       where t.dt='${yesday}'
             |    ) a
             |    right outer join (
             |      select package_name, platform
             |      from dwh.dm_install_list_v2
             |      where dt='${yesday}'
             |      group by package_name, platform
             |    ) b on a.package_name=b.package_name
             |    where a.package_name is null
          """.stripMargin
        */

        spark.udf.register("getPkgs", getPkgs _)
        spark.udf.register("getPlatform", getPlatform _)

        // Anti-join: keep the exploded, de-duplicated package names that have no match in the
        // black list for the same day.
        val sql =
          s"""
             |SELECT b.package_name, getPlatform(b.package_name) platform
             |  FROM
             |    (SELECT package_name
             |      FROM dwh.dmp_install_list LATERAL VIEW explode(getPkgs(install_list)) dmp_table AS package_name WHERE dt = '$yesday' AND business = 'day'
             |      GROUP BY package_name
             |    ) b
             |    LEFT JOIN
             |    (SELECT package_name FROM dev.dm_package_black_list WHERE dt = '$yesday') a
             |  ON a.package_name = b.package_name
             |  WHERE a.package_name IS NULL
             |""".stripMargin

        spark.sql(sql).rdd
          .repartition(numPartitions)
          .map(_.mkString("\t"))
          .saveAsTextFile(pkginstallpath, classOf[GzipCodec])

        0
      } finally {
        spark.stop()
      }
    }
  }

  /**
    * Classifies a package name by platform.
    *
    * @param packageName package name to classify
    * @return `"android"` when the name fully matches `MobvistaConstant.adrPkgPtn`,
    *         `"ios"` otherwise
    */
  def getPlatform(packageName: String): String =
    if (MobvistaConstant.adrPkgPtn.matcher(packageName).matches()) "android" else "ios"

  /**
    * Extracts the `org.chromium.webapk.*` package names from a serialized install list.
    *
    * All spaces are removed from the input, it is parsed with
    * `MobvistaConstant.String2JSONObject`, and the keys of the resulting object that start
    * with `org.chromium.webapk.` are returned.
    *
    * @param installList JSON object text whose keys are package names; may be `null`
    * @return the matching keys, or an empty array when `installList` is `null`
    */
  def getPkgs(installList: String): mutable.WrappedArray[String] = {
    // A NULL install_list column reaches the UDF as null; returning an empty array makes
    // explode() emit no rows for that record instead of failing the task with an NPE.
    val webApkPackages: Array[String] = Option(installList).fold(Array.empty[String]) { raw =>
      MobvistaConstant.String2JSONObject(raw.replace(" ", ""))
        .keySet().asScala
        .filter(_.startsWith("org.chromium.webapk."))
        .toArray
    }
    mutable.WrappedArray.make[String](webApkPackages)
  }
}

/** Command-line launcher for the [[CrawPkgsSpark]] job. */
object CrawPkgsSpark {

  /**
    * Runs the job and propagates a failing status code to the process exit code.
    *
    * @param args command-line arguments forwarded unchanged to `CrawPkgsSpark.run`
    */
  def main(args: Array[String]): Unit = {
    val exitCode = new CrawPkgsSpark().run(args)
    // Only exit explicitly on failure, so that a rejected invocation is not reported as a
    // successful run; on success main returns normally and the JVM shuts down as before.
    if (exitCode != 0) sys.exit(exitCode)
  }
}
