package mobvista.dmp.datasource.newtag

import java.net.URI

import com.alibaba.fastjson.JSON
import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.MultipleOrcOutputFormat
import mobvista.dmp.util.{DateUtil, MRUtils}
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.cli.Options
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.Text
import org.apache.orc.mapred.OrcStruct
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

/**
 * Tags the devices of the install list using the new tag system.
 * Tags come from two sources: manually labelled packages and app-store tags
 * mapped onto the new tag system.
 *
 * Overall logic:
 *  1. Anti-join dwh.dim_app_tag against dwh.dim_package_tags to find the
 *     packages that are NOT manually labelled.
 *  2. Split the tags of the packages from step 1 and map them onto the new tag system.
 *  3. Union the result of step 2 with the manually labelled packages and match it
 *     against the packages of every device in the install list to tag the devices.
 */
class MatchInterestTagDailyV2 extends CommonSparkJob with Serializable {

  /** ORC schema handed to the output format through `orc.mapred.output.schema`. */
  val schema = "struct<device_id:string,device_type:string,platform:string,package_name:string,interest_tag:string>"

  /**
   * Broadcast lookup: upper-cased "tag_type-first_tag-second_tag" -> (new_first_id, new_second_id).
   * Assigned inside [[run]] before the `get_tag_code` UDF is registered.
   */
  var tagMap: Broadcast[scala.collection.Map[String, (String, String)]] = null

  /**
   * Runs the job.
   *
   * @param args command line arguments, see [[buildOptions]]
   * @return 1 when the mandatory-option check fails, 0 once the output has been written
   */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      return 1
    }
    printOptions(commandLine)

    val date = commandLine.getOptionValue("date")
    val business = commandLine.getOptionValue("business")
    val storeOutput = commandLine.getOptionValue("storeOutput")
    val manualOutput = commandLine.getOptionValue("manualOutput")
    // Parsed up front so that a malformed value fails before any cluster work
    // is started and before the previous output is deleted.
    val partitions = commandLine.getOptionValue("coalesce").toInt

    val spark = SparkSession
      .builder()
      .appName(s"MatchInterestTagDailyV2.$date.$business")
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "lz4")
      .config("spark.io.compression.lz4.blockSize", "64k")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.sql.autoBroadcastJoinThreshold", "209715200")
      .config("spark.sql.broadcastTimeout", "1200")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    import spark.implicits._

    try {
      val sc = spark.sparkContext

      // Remove the output of a previous run. Done inside the try block so that
      // the session is still stopped when the deletion fails.
      val fs = FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration)
      fs.delete(new Path(manualOutput), true)
      fs.delete(new Path(storeOutput), true)

      // Step 1: packages present in dim_app_tag but absent from dim_package_tags,
      // with their tag JSON split into one PackageTag per entry.
      val untaggedSql =
        s"""
           |SELECT
           |  /*+ MAPJOIN(b) */
           |  a.package_name,
           |  a.platform,
           |  a.tag
           |FROM (
           |       SELECT
           |         t.package_name,
           |         CASE WHEN t.platform = 'ios' THEN 'ios' ELSE 'android' END AS platform,
           |         t.tag
           |       FROM dwh.dim_app_tag t
           |       WHERE concat(t.year, t.month, t.day) = '$date'
           |     ) a
           |LEFT OUTER JOIN dwh.dim_package_tags b
           |    ON a.package_name = b.package_name AND a.platform = b.platform
           |WHERE b.package_name IS NULL
        """.stripMargin
      spark.sql(untaggedSql)
        .flatMap(flatInterest)
        .toDF()
        .createOrReplaceTempView("t_app_two_tags")

      // Step 2: replace the original MV tags with the tags of the new tag system.
      val remapSql =
        """
          |select /*+ MAPJOIN(a) */ b.package_name, b.platform, b.tag_type, a.new_first_tag_1 as first_tag,
          |  a.new_second_tag_1 as second_tag
          |from dwh.dim_category_mv_new a
          |join t_app_two_tags b on a.mv_first_tag=b.first_tag and a.mv_second_tag=b.second_tag
          |group by  b.package_name, b.platform, b.tag_type, a.new_first_tag_1, a.new_second_tag_1
        """.stripMargin
      spark.sql(remapSql)
        .createOrReplaceTempView("t_app_two_tags_new")

      val tagSql =
        """
          |select upper(concat(tag_type, '-', first_tag, '-', second_tag)) tag_id, new_first_id, new_second_id from dwh.dm_old2new_tag
          |""".stripMargin

      // Explicit type argument: the element type of getAs is not left to inference.
      tagMap = sc.broadcast(spark.sql(tagSql).rdd.map { row =>
        val tagId = row.getAs[Any]("tag_id").toString
        val firstId = row.getAs[Any]("new_first_id").toString
        val secondId = row.getAs[Any]("new_second_id").toString
        (tagId, (firstId, secondId))
      }.collectAsMap())

      spark.udf.register("get_tag_code", getTagCode _)

      spark.udf.register("toTags", toTags _)

      // Step 3a: one row per (package, platform) with its tag codes and the output
      // path its records belong to (store-derived vs. manually labelled).
      val packageTagSql =
        s"""
           |select package_name, platform, toTags(concat_ws(',',collect_set(get_tag_code(upper(concat(tag_type, '-', first_tag, '-', second_tag)))))) tag_code, '$storeOutput' as path
           |  from t_app_two_tags_new group by package_name, platform
           |  union all
           |select package_name, platform, toTags(concat_ws(',',collect_set(get_tag_code(upper(concat(tag_type, '-', first_tag, '-', second_tag)))))) tag_code, '$manualOutput' as path
           |  from dwh.dim_package_tags group by package_name, platform
        """.stripMargin

      // NOTE(review): the lookup is keyed by package_name only, so when the same
      // package name occurs for several platforms (or in both branches of the union)
      // only one entry survives collectAsMap — confirm this is intended.
      val packageTags = sc.broadcast(spark.sql(packageTagSql).rdd.map { row =>
        val tagCode = row.getAs[Any]("tag_code").toString
        val path = row.getAs[Any]("path").toString
        (row.getAs[Any]("package_name").toString, MRUtils.JOINER.join(tagCode, path))
      }.collectAsMap())

      val updateDate = if (business.equals("14days")) {
        DateUtil.format(DateUtil.getDay(date, "yyyyMMdd", -14), "yyyy-MM-dd")
      } else {
        DateUtil.format(DateUtil.parse(date, "yyyyMMdd"), "yyyy-MM-dd")
      }
      val installSql =
        s"""
           |select device_id, device_type, platform, install_list
           |  from dwh.dmp_install_list
           |  where dt='$date' and business='$business' and update_date >= '$updateDate'
        """.stripMargin

      // Step 3b: for every device emit one record per installed package that has tags.
      val deviceTags = spark.sql(installSql).rdd.flatMap { row =>
        val deviceId = row.getAs[Any]("device_id").toString
        val deviceType = row.getAs[Any]("device_type").toString
        val platform = row.getAs[Any]("platform").toString
        val installList = row.getAs[Any]("install_list").toString

        val matched = new ArrayBuffer[DeviceTagDaily]()
        // The keys of the install_list JSON object are looked up as package names.
        JSON.parseObject(installList).keySet().foreach { pkg =>
          val tag = packageTags.value.getOrElse(pkg, "")
          if (StringUtils.isNotBlank(tag)) {
            // The broadcast value is "tag_code" and "path" joined by MRUtils.JOINER.
            val parts = MRUtils.SPLITTER.split(tag)
            matched += DeviceTagDaily(deviceId, deviceType, platform, pkg, parts(0), parts(1))
          }
        }
        matched
      }

      deviceTags
        .mapPartitions(v => new CustomIteratorDailyV2(v))
        .repartition(partitions)
        .saveAsNewAPIHadoopFile(manualOutput, classOf[Text], classOf[OrcStruct], classOf[MultipleOrcOutputFormat[Text, OrcStruct]],
          initConfig(sc.hadoopConfiguration))
    } finally {
      spark.stop()
    }
    0
  }

  /**
   * Sets the ORC schema and the compression related keys on `conf`.
   *
   * @param conf Hadoop configuration; it is modified in place
   * @return the same `conf` instance
   */
  def initConfig(conf: Configuration): Configuration = {
    conf.set("orc.mapred.output.schema", schema)
    conf.setBoolean("mapreduce.output.compress", true)
    conf.set("mapreduce.output.compression.codec", "com.hadoop.compression.lzo.LzoCodec")
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    // NOTE(review): this key receives a codec class name; if the codec was meant,
    // the key would presumably be "...fileoutputformat.compress.codec" — confirm
    // before changing, the value is kept as is here.
    conf.set("mapreduce.output.fileoutputformat.compress.type", "com.hadoop.compression.lzo.LzoCodec")
    conf.set("orc.compress", "ZLIB")
    conf
  }

  /**
   * Looks up the new tag ids of an old tag id in [[tagMap]].
   *
   * @param tag_id upper-cased "tag_type-first_tag-second_tag"
   * @return "new_first_id,new_second_id", or "00000000,00000000" when the id is unknown
   */
  def getTagCode(tag_id: String): String = {
    val (firstId, secondId) = tagMap.value.getOrElse(tag_id, ("00000000", "00000000"))
    firstId + "," + secondId
  }

  /**
   * Normalises a comma separated list of tag codes: blank entries and the
   * "00000000" placeholder are dropped, the rest is de-duplicated and sorted.
   *
   * @param tags comma separated tag codes
   * @return the cleaned codes joined by ","
   */
  def toTags(tags: String): String = {
    tags.split(",", -1)
      .filter(tag => StringUtils.isNotBlank(tag) && !tag.equals("00000000"))
      .distinct
      .sorted
      .mkString(",")
  }

  /**
   * Splits the tag JSON array of one package row into one [[PackageTag]] per element.
   *
   * @param row row with package_name (index 0), platform (index 1) and the tag JSON array (index 2)
   * @return one PackageTag of type "category" per array element; the second tag is ""
   *         when key "2" is missing or JSON null
   */
  def flatInterest(row: Row): Array[PackageTag] = {
    val packageName = row.getString(0)
    val platform = row.getString(1)
    val buffer = new ArrayBuffer[PackageTag]()
    GsonUtil.String2JsonArray(row.getString(2))
      .foreach(element => {
        val obj = element.getAsJsonObject
        val firstTag = obj.get("1").getAsString
        val second = obj.get("2")
        // A JSON null cannot be read with getAsString, so it is handled like a missing key.
        val secondTag = if (second != null && !second.isJsonNull) second.getAsString else ""
        buffer += PackageTag(packageName, platform, "category", firstTag, secondTag)
      })
    buffer.toArray
  }


  /** Declares the command line options read by [[run]]. */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("date", true, "[must] date")
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("business", true, "[must] business")
    options.addOption("manualOutput", true, "[must] manualOutput path")
    options.addOption("storeOutput", true, "[must] store tag output path")
    options
  }
}

/** Command line entry point of the job. */
object MatchInterestTagDailyV2 {
  /**
   * Runs the job and propagates a non-zero result of `run` as the process exit
   * status, so that a rejected invocation is not reported as a success.
   *
   * @param args command line arguments forwarded to `run`
   */
  def main(args: Array[String]): Unit = {
    val exitCode = new MatchInterestTagDailyV2().run(args)
    if (exitCode != 0) {
      sys.exit(exitCode)
    }
  }
}