package mobvista.dmp.datasource.newtag

import java.net.URI

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.MultipleOrcOutputFormat
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.cli.Options
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.Text
import org.apache.orc.TypeDescription
import org.apache.orc.mapred.OrcStruct
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

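/**
  * Tags install-list devices using the new tag taxonomy.
  * Tags come from two sources: manually labelled packages, and app-store tags
  * mapped onto the new taxonomy.
  * Overall logic:
  *   1. Take the set difference between the MV package-tag table and the manually
  *      labelled package-tag table to find the packages that have NOT been manually labelled.
  *   2. Split the tags of the step-1 packages and map them onto the new taxonomy.
  *   3. Merge the step-2 data with the manually labelled package tags, then join the
  *      result with the install list to tag the devices.
  */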
class MatchInterestTagV2 extends CommonSparkJob with Serializable {

  /**
    * ORC schema published to the output format through `orc.mapred.output.schema`.
    *
    * It lists, in order, the six fields that [[CustomIteratorV2]] stores in every struct.
    * The previous five-field version omitted `update_date`, so the sixth struct field was
    * not described to the writer.
    */
  val schema = "struct<device_id:string,device_type:string,platform:string,package_name:string,tag_code:string,update_date:string>"

  /**
    * Job entry point: validates the options, builds the intermediate temp views and writes
    * the tagged devices.
    *
    * @param args command-line arguments; see [[buildOptions]] for the accepted options
    * @return 0 on success, 1 when a mandatory option is missing
    * @throws NumberFormatException if `coalesce` is not an integer; raised before the
    *                               Spark session is created and before any output is deleted
    */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      1
    } else {
      printOptions(commandLine)

      val date = commandLine.getOptionValue("date")
      val business = commandLine.getOptionValue("business")
      // Parsed up front so that a malformed value fails fast instead of after the
      // previous outputs have already been removed.
      val partitions = commandLine.getOptionValue("coalesce").toInt
      val storeOutput = commandLine.getOptionValue("storeOutput")
      val manualOutput = commandLine.getOptionValue("manualOutput")

      val spark = SparkSession
        .builder()
        .appName(s"dmp_MatchInterestTagV2_wangjf.$business.$date")
        .config("spark.sql.orc.filterPushdown", "true")
        .config("spark.rdd.compress", "true")
        .config("spark.io.compression.codec", "snappy")
        .config("spark.sql.autoBroadcastJoinThreshold", "268435456")
        .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .enableHiveSupport()
        .getOrCreate()

      try {
        // Inside the try block so the session is stopped even when the cleanup fails.
        clearOutputs(spark, Seq(manualOutput, storeOutput))
        registerStoreTags(spark, date)
        registerPackageTags(spark, storeOutput, manualOutput)
        registerInstallList(spark, date, business)
        writeTaggedDevices(spark, partitions, manualOutput)
        0
      } finally {
        spark.stop()
      }
    }
  }

  /** Recursively deletes each of `outputs` through the `s3://mob-emr-test` file system. */
  private def clearOutputs(spark: SparkSession, outputs: Seq[String]): Unit = {
    val fs = FileSystem.get(new URI("s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration)
    outputs.foreach(output => fs.delete(new Path(output), true))
  }

  /**
    * Registers `t_app_two_tags` (packages of `dwh.dim_app_tag` for `date` that have no row in
    * `dwh.dim_package_tags`, one row per tag entry) and `t_app_two_tags_new` (the same
    * packages with their tags mapped through `dwh.dim_category_mv_new`).
    */
  private def registerStoreTags(spark: SparkSession, date: String): Unit = {
    import spark.implicits._

    // Anti-join: keep only the packages that are absent from dim_package_tags.
    val unlabelledSql =
      s"""
         |SELECT
         |  /*+ MAPJOIN(b) */
         |  a.package_name,
         |  a.platform,
         |  a.tag
         |FROM (
         |       SELECT
         |         t.package_name,
         |         CASE WHEN t.platform = 'ios' THEN 'ios' ELSE 'android' END AS platform,
         |         t.tag
         |       FROM dwh.dim_app_tag t
         |       WHERE concat(t.year, t.month, t.day) = '${date}'
         |     ) a
         |LEFT OUTER JOIN dwh.dim_package_tags b
         |    ON a.package_name = b.package_name AND a.platform = b.platform
         |WHERE b.package_name IS NULL
        """.stripMargin
    spark.sql(unlabelledSql)
      .flatMap(flatInterest)
      .toDF()
      .createOrReplaceTempView("t_app_two_tags")

    // Replace the original MV tags with the tags of the new taxonomy.
    val mappedSql =
      """
        |select /*+ MAPJOIN(a) */ b.package_name, b.platform, b.tag_type, a.new_first_tag_1 as first_tag,
        |  a.new_second_tag_1 as second_tag
        |from dwh.dim_category_mv_new a
        |join t_app_two_tags b on a.mv_first_tag=b.first_tag and a.mv_second_tag=b.second_tag
        |group by  b.package_name, b.platform, b.tag_type, a.new_first_tag_1, a.new_second_tag_1
        """.stripMargin
    spark.sql(mappedSql).createOrReplaceTempView("t_app_two_tags_new")
  }

  /**
    * Registers `package_tags`: the rows of `t_app_two_tags_new` (path = `storeOutput`) and of
    * `dwh.dim_package_tags` (path = `manualOutput`), translated to tag codes through
    * `dwh.dm_old2new_tag` and collapsed to one comma-separated `tag_code` per
    * (package_name, platform, path).
    */
  private def registerPackageTags(spark: SparkSession, storeOutput: String, manualOutput: String): Unit = {
    val packageTagsSql =
      s"""
         |select package_name, platform, concat_ws(',',collect_set(tag_code)) tag_code, path from
         |  (select /*+ MAPJOIN(a) */ b.package_name, b.platform, a.tag_code, b.path
         |  from
         |  (select upper(concat(tag_type, '-', first_tag, '-', second_tag)) tag_id, new_second_id tag_code from dwh.dm_old2new_tag) a
         |  join
         |  (select package_name, platform, upper(concat(tag_type, '-', first_tag, '-', second_tag)) tag_id, '${storeOutput}' as path
         |    from t_app_two_tags_new
         |    union all
         |    select package_name, platform, upper(concat(tag_type, '-', first_tag, '-', second_tag)) tag_id, '${manualOutput}' as path
         |    from dwh.dim_package_tags
         |  ) b
         |  on a.tag_id = b.tag_id
         |) t
         |group by package_name, platform, path
          """.stripMargin
    spark.sql(packageTagsSql).createOrReplaceTempView("package_tags")
  }

  /** Registers `install_list`: the `dwh.dm_install_list_v2` rows of the given `date` and `business`. */
  private def registerInstallList(spark: SparkSession, date: String, business: String): Unit = {
    val installListSql =
      s"""
         |select device_id, device_type, platform, package_name, update_date
         |  from dwh.dm_install_list_v2
         |  where dt='${date}' and business='${business}'
        """.stripMargin
    spark.sql(installListSql).createOrReplaceTempView("install_list")
  }

  /**
    * Joins `package_tags` with `install_list` and saves the result as ORC under `manualOutput`.
    *
    * Every record is keyed by its `path` column. NOTE(review): `MultipleOrcOutputFormat` is
    * defined outside this file; presumably it uses that key to route store-tag and
    * manual-tag records to their own locations - confirm against its implementation.
    */
  private def writeTaggedDevices(spark: SparkSession, partitions: Int, manualOutput: String): Unit = {
    // The column order must match the positional reads performed by CustomIteratorV2.
    val taggedDevicesSql =
      s"""
         |select /*+ MAPJOIN(a) */ b.device_id device_id, b.device_type,
         |  b.platform, b.package_name,
         |  a.tag_code tag_code, b.update_date, a.path path
         |  from package_tags a join install_list b
         |  on a.platform = b.platform and a.package_name = b.package_name
        """.stripMargin

    spark.sql(taggedDevicesSql)
      .rdd
      .mapPartitions(rows => new CustomIteratorV2(rows))
      .repartition(partitions)
      .saveAsNewAPIHadoopFile(manualOutput, classOf[Text], classOf[OrcStruct], classOf[MultipleOrcOutputFormat[Text, OrcStruct]],
        initConfig(spark.sparkContext.hadoopConfiguration))
  }

  /**
    * Stores the ORC output schema and the compression settings in `conf`.
    *
    * The configuration is modified in place and the same instance is returned.
    * NOTE(review): two different codecs (Gzip and Snappy) are set under two different keys
    * next to `orc.compress=ZLIB`; which of them takes effect depends on the output
    * format - confirm before changing any of them.
    *
    * @param conf Hadoop configuration to update
    * @return `conf` itself
    */
  def initConfig(conf: Configuration): Configuration = {
    conf.set("orc.mapred.output.schema", schema)
    conf.setBoolean("mapreduce.output.compress", true)
    conf.set("mapreduce.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec")
    conf.set("orc.compress", "ZLIB")
    conf
  }

  /**
    * Expands one (package_name, platform, tag JSON) row into one [[PackageTag]] per tag entry.
    *
    * The tag column is parsed as a JSON array of objects; key `"1"` is read as the first tag
    * and key `"2"` as the second tag. A missing or JSON-null `"2"` becomes the empty string.
    * Entries without a usable `"1"` are skipped, and a null tag column yields no tags.
    * Every tag is created with the tag type `"category"`.
    *
    * @param row row whose columns 0, 1 and 2 hold package name, platform and tag JSON
    * @return the tags of the package, possibly empty
    */
  def flatInterest(row: Row): Array[PackageTag] = {
    val packageName = row.getString(0)
    val platform = row.getString(1)
    val buffer = new ArrayBuffer[PackageTag]()
    Option(row.getString(2)).foreach { tagJson =>
      GsonUtil.String2JsonArray(tagJson).foreach { element =>
        val obj = element.getAsJsonObject
        // JsonObject.get returns null for an absent key, hence the Option wrapper.
        for (first <- Option(obj.get("1")) if !first.isJsonNull) {
          val secondTag = Option(obj.get("2")).filterNot(_.isJsonNull).fold("")(_.getAsString)
          buffer += PackageTag(packageName, platform, "category", first.getAsString, secondTag)
        }
      }
    }
    buffer.toArray
  }

  /**
    * Declares the command-line options of the job.
    *
    * @return options `date`, `business`, `coalesce`, `manualOutput` and `storeOutput`,
    *         each taking one argument
    */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("date", true, "[must] date")
    options.addOption("business", true, "[must] business")
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("manualOutput", true, "[must] manualOutput path")
    options.addOption("storeOutput", true, "[must] store tag output path")
    options
  }
}

/**
  * Adapts an iterator of rows into the `(Text, OrcStruct)` pairs consumed by the ORC
  * output format.
  *
  * Each row must expose string columns 0 to 5 in the order of `schema`, followed by the
  * output path in column 6. The path becomes the key of the pair and the six leading
  * columns become the fields of the struct.
  *
  * @param iter rows to convert
  */
class CustomIteratorV2(iter: Iterator[Row]) extends Iterator[(Text, OrcStruct)] {
  val schema = "struct<device_id:string,device_type:string,platform:string,package_name:string,tag_code:string,update_date:string>"

  // Parsed once per iterator instead of once per row: the schema string never changes.
  private val typeDescription = TypeDescription.fromString(schema)

  // Number of leading row columns copied into the struct; the next column holds the path.
  private val fieldCount = typeDescription.getChildren.size()

  /** @return true while the underlying row iterator has more rows */
  def hasNext: Boolean = iter.hasNext

  /**
    * Converts the next row.
    *
    * @return the output path as key and a freshly created struct holding the row's fields
    */
  def next(): (Text, OrcStruct) = {
    val row = iter.next()
    val struct = OrcStruct.createValue(typeDescription).asInstanceOf[OrcStruct]
    (0 until fieldCount).foreach { index =>
      struct.setFieldValue(index, new Text(row.getString(index)))
    }
    (new Text(row.getString(fieldCount)), struct)
  }
}

/** Command-line launcher for the [[MatchInterestTagV2]] job. */
object MatchInterestTagV2 {

  /**
    * Runs the job and propagates a non-zero result as the process exit status, so that
    * a rejected invocation is visible to whatever launched the job.
    *
    * @param args command-line arguments forwarded to the job
    */
  def main(args: Array[String]): Unit = {
    val exitCode = new MatchInterestTagV2().run(args)
    // Only exit explicitly on failure; a successful run terminates normally.
    if (exitCode != 0) {
      sys.exit(exitCode)
    }
  }
}