package mobvista.dmp.datasource.event_tag

import java.net.URI

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.MultipleOrcOutputFormat
import org.apache.commons.cli.Options
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{Text, WritableComparable}
import org.apache.orc.TypeDescription
import org.apache.orc.mapred.OrcStruct
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.sql.{Encoder, Encoders}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer


/**
 * Spark job that reads one day of `dwh.ods_dmp_event_org`, derives a behavior
 * tag for each device event and writes the tagged rows as ORC files.
 *
 * Command-line options (see [[buildOptions]]): `today`, `coalesce`, `output`.
 */
class DmpEventTag extends CommonSparkJob with Serializable {

  /** ORC schema of the output rows; field order must match the indices used when filling a struct. */
  val schema: String = "struct<device_id:string,uuid:string,platform:string,device_type:string,country:string,package_name:string,first_tag:string,second_tag:string,behavior_tag:string,update_date:string>"

  // Keyword lists are built once per job instance instead of once per call.
  private val paymentKeyWordList: Array[String] = Array("buy", "iap", "monetization", "subscription", "pay", "paid", "purchase", "revenue", "sale", "reservation", "transaction", "book", "order")

  private val activeKeyWordList: Array[String] = Array("account", "achieve", "active", "ad", "add", "clear", "click", "complete", "confirm", "consume", "create", "day", "done", "end", "enter", "event", "game", "level", "lv", "mode", "pvp", "retention", "reward", "share", "show", "start", "success", "theme", "time", "tutorial", "update", "video", "view", "watch", "wish")

  /**
   * Parses the options, runs the tagging query and writes the result.
   *
   * @param args raw command-line arguments
   * @return `0` on success, `-1` when a mandatory option is missing
   */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      -1
    } else {
      printOptions(commandLine)

      val output = commandLine.getOptionValue("output")
      val today = commandLine.getOptionValue("today")
      // Parse up front so a malformed value fails before anything is deleted or computed.
      val partitions = commandLine.getOptionValue("coalesce").toInt

      val spark = createSession()
      try {
        val hadoopConf = spark.sparkContext.hadoopConfiguration

        // Resolve the file system from the output path itself so the cleanup
        // targets the same location the job later writes to.
        val outputPath = new Path(output)
        outputPath.getFileSystem(hadoopConf).delete(outputPath, true)

        spark.sql(eventQuery(output, today))
          .filter(filterData(_))
          .rdd
          .mapPartitions { rows =>
            // Parse the schema once per partition and stream the records lazily
            // instead of buffering the whole partition in memory.
            val rowType = TypeDescription.fromString(schema)
            rows.flatMap(row => toOutputRecord(row, rowType))
          }
          .coalesce(partitions)
          // NOTE(review): each record key is "<output>/<business>"; presumably
          // MultipleOrcOutputFormat routes records by that key — confirm.
          .saveAsNewAPIHadoopFile(
            s"""${output}/allpb""", classOf[Text], classOf[OrcStruct],
            classOf[MultipleOrcOutputFormat[Text, OrcStruct]],
            initConfig(hadoopConf))
        0
      } finally {
        spark.stop()
      }
    }
  }

  /** Creates the Hive-enabled SparkSession used by this job. */
  private def createSession(): SparkSession =
    SparkSession
      .builder()
      .appName("DmpEventTag")
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "lz4")
      .config("spark.io.compression.lz4.blockSize", "64k")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.sql.autoBroadcastJoinThreshold", "209715200")
      .config("spark.sql.broadcastTimeout", "1200")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

  /**
   * Builds the de-duplicating query over `dwh.ods_dmp_event_org` for one day.
   *
   * NOTE(review): `output` and `today` are spliced into the SQL text verbatim;
   * they come from the command line and must not contain quote characters.
   *
   * @param output output root, used to compute the per-business `path` column
   * @param today  value of the `dt` partition to read
   */
  private def eventQuery(output: String, today: String): String =
    s"""
       |select device_id,
       |uuid,
       |platform,
       |device_type,
       |country,
       |package_name,
       |event_name,
       |first_tag,
       |second_tag,
       |pursub,
       |install_time,
       |concat("${output}/",business)  path,
       |business,
       |dt update_date
       |from dwh.ods_dmp_event_org
       |where dt= '${today}'  and business in ('ss','3s','allpb')
       |group by
       |device_id,
       |uuid,
       |platform,
       |device_type,
       |country,
       |package_name,
       |event_name,
       |first_tag,
       |second_tag,
       |pursub,
       |install_time,
       |business,
       |dt
        """.stripMargin

  /**
   * Turns one query row into the keyed ORC record to write, or `None` when the
   * row does not earn a behavior tag.
   *
   * @param row     a row produced by [[eventQuery]]
   * @param rowType parsed form of [[schema]]
   */
  private def toOutputRecord(row: Row, rowType: TypeDescription): Option[(Text, OrcStruct)] = {
    val pursub = row.getAs[String]("pursub")
    val tag =
      // For "ga" rows the pursub value itself is stored as the tag. The query
      // above only selects 'ss', '3s' and 'allpb', so this branch is not
      // expected to fire for its rows.
      if ("ga".equalsIgnoreCase(row.getAs[String]("business"))) Some(pursub)
      else deriveBehaviorTag(row.getAs[String]("first_tag"), pursub, row.getAs[String]("event_name"))

    tag.map(t => (new Text(row.getAs[String]("path")), fillStruct(row, t, rowType)))
  }

  /**
   * Decides the behavior tag from the first-level tag, the purchase flag and
   * the event name. At most one tag applies to a row.
   *
   * @return the tag, or `None` when no rule matches
   */
  private def deriveBehaviorTag(firstTag: String, pursub: String, eventName: String): Option[String] = {
    val purchased = "1".equalsIgnoreCase(pursub)
    val hasEvent = !StringUtils.isBlank(eventName)
    // Evaluated lazily: the keyword scan is skipped when the purchase flag already decides.
    def paymentEvent: Boolean = hasEvent && isPaymentKeyWord(eventName)

    if ("Games".equalsIgnoreCase(firstTag)) {
      if (purchased || paymentEvent) Some("game_purchase")
      else if (hasEvent && isActiveKeyWord(eventName)) Some("game_active")
      else None
    } else if ("Shopping".equalsIgnoreCase(firstTag)) {
      if (purchased || paymentEvent) Some("shopping_purchase")
      else if (hasEvent) Some("shopping_active")
      else None
    } else if ("Travel".equalsIgnoreCase(firstTag)) {
      Some("travel_active")
    } else {
      None
    }
  }

  /**
   * Keeps rows whose `device_id` is non-blank and matches one of the accepted
   * id patterns: `didPtn` (excluding the `allZero` value), `imeiPtn` or
   * `andriodIdPtn`.
   *
   * @param line a row carrying a `device_id` column
   * @return `true` when the row should be kept
   */
  def filterData(line: Row): Boolean = {
    val deviceId = line.getAs[String]("device_id")
    StringUtils.isNotBlank(deviceId) && (
      (deviceId.matches(didPtn) && !allZero.equals(deviceId)) ||
        deviceId.matches(imeiPtn) ||
        deviceId.matches(andriodIdPtn)
      )
  }

  /**
   * Sets the ORC schema and the compression-related keys on `conf`.
   * The passed configuration is modified in place and returned.
   *
   * @param conf Hadoop configuration to update
   * @return the same `conf` instance
   */
  def initConfig(conf: Configuration): Configuration = {
    conf.set("orc.mapred.output.schema", schema)
    conf.setBoolean("mapreduce.output.compress", true)
    conf.set("mapreduce.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec")
    conf.set("orc.compress", "ZLIB")
    conf
  }

  /**
   * Tells whether `category_id` is one of the ids `"6"` or `"47"`.
   * Not called anywhere else in this class.
   */
  def isPurSubKeyWord(category_id: String): Boolean =
    "6".equalsIgnoreCase(category_id) || "47".equalsIgnoreCase(category_id)

  /**
   * Tells whether an event name looks like a payment event: it contains at
   * least one payment keyword and neither `"cancel"` nor `"fail"`.
   * Matching is case-sensitive.
   *
   * @param words event name; `null` yields `false`
   */
  def isPaymentKeyWord(words: String): Boolean =
    words != null &&
      !words.contains("cancel") &&
      !words.contains("fail") &&
      paymentKeyWordList.exists(keyword => words.contains(keyword))

  /**
   * Tells whether an event name contains at least one activity keyword.
   * Matching is case-sensitive.
   *
   * @param words event name; `null` yields `false`
   */
  def isActiveKeyWord(words: String): Boolean =
    words != null && activeKeyWordList.exists(keyword => words.contains(keyword))

  /**
   * Builds the output ORC struct for one row.
   *
   * @param row          source row from the event query
   * @param behavior_tag tag stored in the `behavior_tag` field
   * @param schema       ORC schema string describing the struct
   * @return the populated struct
   */
  def genRow(row: Row, behavior_tag: String, schema: String): OrcStruct =
    fillStruct(row, behavior_tag, TypeDescription.fromString(schema))

  /**
   * Populates a new struct of type `rowType` from `row`.
   * `second_tag` is only carried over for rows whose first tag is "Games".
   */
  private def fillStruct(row: Row, behaviorTag: String, rowType: TypeDescription): OrcStruct = {
    val firstTag = row.getAs[String]("first_tag")
    val secondTag = if ("Games".equalsIgnoreCase(firstTag)) row.getAs[String]("second_tag") else ""

    val struct = OrcStruct.createValue(rowType).asInstanceOf[OrcStruct]
    struct.setFieldValue(0, toText(row.getAs[String]("device_id")))
    struct.setFieldValue(1, toText(row.getAs[String]("uuid")))
    struct.setFieldValue(2, toText(row.getAs[String]("platform")))
    struct.setFieldValue(3, toText(row.getAs[String]("device_type")))
    struct.setFieldValue(4, toText(row.getAs[String]("country")))
    struct.setFieldValue(5, toText(row.getAs[String]("package_name")))
    struct.setFieldValue(6, toText(firstTag))
    struct.setFieldValue(7, toText(secondTag))
    struct.setFieldValue(8, toText(behaviorTag))
    struct.setFieldValue(9, toText(row.getAs[String]("update_date")))
    struct
  }

  /**
   * Wraps a possibly-null string in a `Text`. `new Text(null: String)` throws,
   * so SQL NULLs are written as empty strings, matching how a missing
   * `second_tag` is already represented.
   */
  private def toText(value: String): Text = new Text(if (value == null) "" else value)

  /** Declares the command-line options understood by this job. */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("today", true, "[must] today")
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("output", true, "[must] output")
    options
  }

}

/** Command-line launcher for the [[DmpEventTag]] job. */
object DmpEventTag {
  /**
   * Creates a job instance and runs it with the given arguments.
   * The integer status returned by `run` is discarded.
   *
   * @param args raw command-line arguments forwarded to the job
   */
  def main(args: Array[String]): Unit = {
    val job = new DmpEventTag
    job.run(args)
  }
}

