package mobvista.dmp.datasource.newtag

import mobvista.dmp.common.CommonSparkJob
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.cli.Options
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.SequenceFile
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.storage.StorageLevel

import java.net.URI
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

/**
  * Tags install-list devices using the new tag taxonomy.
  *
  * Tags come from two sources: manually labelled packages (`dwh.dim_package_tags`) and
  * app-store tags (`dwh.dim_app_tag`) mapped onto the new taxonomy.
  * Overall flow:
  *   1. Anti-join the MV package tag table against the manually labelled package tag table
  *      to find the packages that have NOT been manually labelled.
  *   2. Split the tags of those packages and map them onto the new taxonomy.
  *   3. Join the manually labelled tags and the mapped store tags with the install list;
  *      each source is written to its own ORC output (`manualOutput` / `storeOutput`).
  */
class MatchInterestTag extends CommonSparkJob with Serializable {

  /** ORC struct schema string that `initConfig` registers under `orc.mapred.output.schema`. */
  val schema = "struct<device_id:string,device_type:string,platform:string,package_name:string,tag_type:string,first_tag:string,second_tag:string,update_date:string>"

  /**
    * Parses the command line and runs the tagging job.
    *
    * @param args command-line arguments; see `buildOptions` for the accepted options
    * @return 1 when `checkMustOption` rejects the command line, 0 when both outputs were written
    */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      1
    } else {
      printOptions(commandLine)

      val date = commandLine.getOptionValue("date")
      val business = commandLine.getOptionValue("business")
      val storeOutput = commandLine.getOptionValue("storeOutput")
      val manualOutput = commandLine.getOptionValue("manualOutput")
      // Parsed and validated once, before any cluster resources are acquired, so a bad value fails fast.
      val partitions = commandLine.getOptionValue("coalesce").toInt
      require(partitions > 0, s"coalesce must be a positive integer, got $partitions")

      val spark = SparkSession
        .builder()
        .appName("dmp_MatchInterestTag_fengliang")
        .config("spark.rdd.compress", "true")
        .config("spark.io.compression.codec", "lz4")
        .config("spark.io.compression.lz4.blockSize", "64k")
        .config("spark.sql.orc.filterPushdown", "true")
        .config("spark.sql.autoBroadcastJoinThreshold", "209715200")
        .config("spark.sql.broadcastTimeout","1200")
        .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .enableHiveSupport()
        .getOrCreate()

      try {
        // Cleaning happens inside the try so the session is stopped even if a delete fails.
        deleteOutputPath(spark, manualOutput)
        deleteOutputPath(spark, storeOutput)

        registerStoreTagViews(spark, date)

        val installSql =
          s"""
             |select device_id, device_type, platform, package_name, update_date
             |  from dwh.dm_install_list_v2
             |  where dt='${date}' and business='${business}'
          """.stripMargin
        // The install list is read by both output joins, so it is the one frame worth caching.
        spark.sql(installSql)
          .persist(StorageLevel.MEMORY_AND_DISK_SER)
          .createOrReplaceTempView("install_list")

        // Manually labelled packages -> manualOutput.
        aggregatePackageTags(spark, "dwh.dim_package_tags")
          .createOrReplaceTempView("dim_package_tags")
        writeTaggedInstallList(spark, "dim_package_tags", partitions, manualOutput)

        // Store tags mapped onto the new taxonomy -> storeOutput.
        aggregatePackageTags(spark, "t_app_two_tags_new")
          .createOrReplaceTempView("t_app_two_tags_store")
        writeTaggedInstallList(spark, "t_app_two_tags_store", partitions, storeOutput)

        0
      } finally {
        spark.stop()
      }
    }
  }

  /**
    * Recursively deletes `location` on the file system that owns that path, so the clean-up
    * targets the same bucket/scheme the job later writes to.
    */
  private def deleteOutputPath(spark: SparkSession, location: String): Unit = {
    val path = new Path(location)
    path.getFileSystem(spark.sparkContext.hadoopConfiguration).delete(path, true)
  }

  /**
    * Registers the temp views `t_app_two_tags` (store tags of packages absent from
    * `dwh.dim_package_tags`, split into first/second level) and `t_app_two_tags_new`
    * (those tags mapped through `dwh.dim_category_mv_new`).
    */
  private def registerStoreTagViews(spark: SparkSession, date: String): Unit = {
    import spark.implicits._

    // Anti-join dim_app_tag against dim_package_tags, then split the JSON tag column.
    val unlabelledSql =
      s"""
         |SELECT
         |  /*+ MAPJOIN(b) */
         |  a.package_name,
         |  a.platform,
         |  a.tag
         |FROM (
         |       SELECT
         |         t.package_name,
         |         CASE WHEN t.platform = 'ios' THEN 'ios' ELSE 'android' END AS platform,
         |         t.tag
         |       FROM dwh.dim_app_tag t
         |       WHERE concat(t.year, t.month, t.day) = '${date}'
         |     ) a
         |LEFT OUTER JOIN dwh.dim_package_tags b
         |    ON a.package_name = b.package_name AND a.platform = b.platform
         |WHERE b.package_name IS NULL
      """.stripMargin
    spark.sql(unlabelledSql)
      .flatMap(flatInterest(_))
      .toDF()
      .createOrReplaceTempView("t_app_two_tags")

    // Replace the original MV tags with the tags of the new taxonomy.
    val mappedSql =
      """
        |select /*+ MAPJOIN(a) */ b.package_name, b.platform, b.tag_type, a.new_first_tag_1 as first_tag,
        |  a.new_second_tag_1 as second_tag
        |from dwh.dim_category_mv_new a
        |join t_app_two_tags b on a.mv_first_tag=b.first_tag and a.mv_second_tag=b.second_tag
        |group by  b.package_name, b.platform, b.tag_type, a.new_first_tag_1, a.new_second_tag_1
      """.stripMargin
    spark.sql(mappedSql)
      .createOrReplaceTempView("t_app_two_tags_new")
  }

  /**
    * Collapses the tag rows of `sourceTable` into one row per (package_name, platform) whose
    * `tags` column is a comma-separated set of `tag_type#first_tag#second_tag` entries
    * (a null second tag becomes the empty string).
    */
  private def aggregatePackageTags(spark: SparkSession, sourceTable: String) = {
    val aggregateSql =
      s"""
         |select x.package_name,x.platform,concat_ws(',', collect_set(concat(x.tag_type,'#',x.first_tag, '#', x.second_tag))) tags
         |from(
         |   select package_name,platform,
         |   tag_type,
         |    first_tag,
         |  case when second_tag is null then "" else second_tag end as second_tag
         |  from $sourceTable) x
         |  group by x.package_name,x.platform
      """.stripMargin
    spark.sql(aggregateSql)
  }

  /**
    * Joins the aggregated tag view `tagView` with the `install_list` temp view on
    * (package_name, platform) and overwrites `output` with zlib-compressed ORC files,
    * repartitioned into `partitions` partitions.
    */
  private def writeTaggedInstallList(spark: SparkSession, tagView: String, partitions: Int, output: String): Unit = {
    val joinSql =
      s"""
         |select /*+ MAPJOIN(a) */ b.device_id, b.device_type, b.platform, b.package_name,
         |  a.tags,
         |  b.update_date
         |from $tagView a
         |join install_list b
         |on a.package_name = b.package_name and a.platform = b.platform
      """.stripMargin
    spark.sql(joinSql)
      .repartition(partitions)
      .write
      .mode(SaveMode.Overwrite)
      .option("orc.compress", "zlib")
      .orc(output)
  }

  /**
    * Sets the ORC output schema and the output-compression keys on `conf`.
    *
    * @param conf Hadoop configuration to mutate
    * @return the same `conf` instance, for chaining
    */
  def initConfig(conf: Configuration): Configuration = {
    conf.set("orc.mapred.output.schema", schema)
    conf.set("mapreduce.output.compress", "true")
    conf.set("mapreduce.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.type", SequenceFile.CompressionType.BLOCK.toString)
    conf
  }

  /**
    * Splits the JSON tag array of one package row into one `PackageTag` per array element.
    *
    * Each element is read as a JSON object whose key "1" holds the first-level tag and whose
    * optional key "2" holds the second-level tag; every produced tag has tag type "category".
    *
    * @param row row with package_name at index 0, platform at index 1 and the JSON tag array at index 2
    * @return the extracted tags; empty when the tag column is null. Elements without a usable
    *         first-level tag are skipped, and a missing or JSON-null second-level tag becomes "".
    */
  def flatInterest(row: Row): Array[PackageTag] = {
    import scala.collection.JavaConverters._

    val packageName = row.getString(0)
    val platform = row.getString(1)

    // A null tag column (or a null parse result) yields no elements instead of a NullPointerException.
    val elements = Option(row.getString(2))
      .flatMap(json => Option(GsonUtil.String2JsonArray(json)))
      .toList
      .flatMap(_.asScala)

    elements.flatMap { element =>
      val obj = element.getAsJsonObject
      // `get` returns null for an absent key and a JsonNull instance for an explicit JSON null;
      // calling getAsString on either would throw, so both are filtered out first.
      val firstTag = Option(obj.get("1")).filterNot(_.isJsonNull).map(_.getAsString)
      val secondTag = Option(obj.get("2")).filterNot(_.isJsonNull).map(_.getAsString).getOrElse("")
      firstTag.map(first => PackageTag(packageName, platform, "category", first, secondTag)).toList
    }.toArray
  }

  /** Declares the command-line options read by `run`. */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("date", true, "[must] date")
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("business", true, "[must] business")
    options.addOption("manualOutput", true, "[must] manualOutput path")
    options.addOption("storeOutput", true, "[must] store tag output path")
    options
  }
}

object MatchInterestTag {

  /**
    * Command-line entry point: runs the job and propagates a non-zero status returned by
    * `run` as the JVM exit code, so that a failed run (e.g. a usage error) is not reported
    * as success to whatever launched the process.
    *
    * @param args command-line arguments forwarded unchanged to `MatchInterestTag.run`
    */
  def main(args: Array[String]): Unit = {
    val exitCode = new MatchInterestTag().run(args)
    if (exitCode != 0) {
      sys.exit(exitCode)
    }
  }
}