package mobvista.dmp

import java.net.URI

import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.Map


/**
  * @Author: xiaobin.yan
  * @Date: 20/07/2017
  * @Description: Computes the daily incremental data of interest tags
  */
object DmpInterestTagDailyJob {

  // Spark job that keeps only the interest-tag records whose device key also
  // appears in the daily input, merges the tag JSON of records sharing a key,
  // and writes the result as gzip-compressed text.

  /** Application name passed to the Spark configuration. */
  val APP_NAME = "dm_interest_tag_daily"

  /** Comma separator; not referenced by the code inside this object. */
  val COMMA_DELIMITER = ","

  /** Field separator of the input lines and of the output lines. */
  val TAB_DELIMITER = "\t"

  /**
    * Entry point.
    *
    * Expects exactly three arguments: the daily input paths, the interest-tag
    * input paths and the output path. Any other argument count prints a usage
    * message to standard error and exits with status 1.
    *
    * @param args command line arguments `<dailyPaths> <interestTagPaths> <outputPath>`
    */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println(args.mkString("<", ":", ">"))
      printErrInfo("Usage: DmpInterestTagDailyJob <Comma Separated DailyPaths> <Comma Separated InterestTagPaths> <outputPath>")
      System.exit(1)
    }
    val Array(dailyPaths, interestTagPaths, outputPath) = args
    println(s"The dailyPaths to be processed is : $dailyPaths")
    println(s"The interestTagPaths to be processed is : $interestTagPaths")

    val conf = new SparkConf()
      .setAppName(APP_NAME)
      .set("spark.task.maxFailures", "10")
      .set("spark.shuffle.io.retryWait", "5")
      .set("spark.shuffle.io.maxRetries", "20")
      .set("spark.stage.maxConsecutiveAttempts", "10")

    val sc = new SparkContext(conf)
    try {
      // Resolve the file system from the output path itself rather than from a
      // fixed bucket, so the cleanup also works when the output lives elsewhere.
      val outputDir = new Path(outputPath)
      val outputUri: URI = outputDir.toUri
      FileSystem.get(outputUri, sc.hadoopConfiguration).delete(outputDir, true)

      // Device keys seen today: the first two tab-separated fields of each line.
      // NOTE(review): a line with fewer than two fields fails the task here.
      val dailyKeys = sc.textFile(dailyPaths).map { line =>
        val fields = line.split(TAB_DELIMITER)
        fields(0) + TAB_DELIMITER + fields(1)
      }.distinct(1500)

      // Interest-tag records keyed the same way; the value is (field 2, field 3).
      // NOTE(review): a line with fewer than four fields fails the task here.
      val interestTags = sc.textFile(interestTagPaths).map { line =>
        val fields = line.split(TAB_DELIMITER)
        (fields(0) + TAB_DELIMITER + fields(1), (fields(2), fields(3)))
      }

      interestTags
        // The constant 1 only turns the daily keys into a pair RDD for the join.
        .join(dailyKeys.map(key => (key, 1)), 1500)
        .reduceByKey { (left, right) =>
          // Several records may share a key: keep the left platform value and
          // merge both tag JSON strings into one.
          val ((platform, leftTags), _) = left
          val ((_, rightTags), _) = right
          val merged = mergeJsonArray(JSON.parseArray(leftTags), JSON.parseArray(rightTags))
          ((platform, merged), 1)
        }
        .map { case (deviceKey, ((platform, tag), _)) =>
          s"$deviceKey$TAB_DELIMITER$platform$TAB_DELIMITER$tag"
        }
        .saveAsTextFile(outputPath, classOf[GzipCodec])
    } finally {
      // Stop the context even when a stage fails.
      sc.stop()
    }
  }

  /**
    * Merges two interest-tag arrays into one JSON string.
    *
    * Entries are grouped by their `package_name`; the tags of entries sharing a
    * package name are combined by [[iteratorJsonArray]]. A null input array
    * contributes nothing.
    *
    * @param jsonArray1 first array of `{"package_name": ..., "tag": [...]}` objects
    * @param jsonArray2 second array of the same shape
    * @return JSON text of an array holding one `package_name`/`tag` object per package
    */
  def mergeJsonArray(jsonArray1: JSONArray, jsonArray2: JSONArray): String = {
    val appMap: Map[String, JSONArray] = Map()
    iteratorJsonArray(jsonArray1, appMap)
    iteratorJsonArray(jsonArray2, appMap)

    val result = new JSONArray
    appMap.foreach { case (packageName, tags) =>
      val entry = new JSONObject
      entry.put("package_name", packageName)
      entry.put("tag", tags)
      result.add(entry)
    }
    result.toJSONString
  }

  /**
    * Regular expression for validating device ids (five hyphen-separated
    * alphanumeric groups of 8-4-4-4-12 characters). Not referenced by the code
    * inside this object.
    */
  val DEVICE_REGEX = "[0-9a-zA-Z]{8}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{12}"

  /**
    * Folds every application entry of `jsonArray` into `appMap`.
    *
    * A package name not yet present is stored with its tag array as-is. For a
    * package name already present, both tag arrays are de-duplicated by tag id
    * and stored as a new array; on an id collision the tag from the entry being
    * added is kept, because it is registered first.
    *
    * @param jsonArray array of application entries; null is treated as empty
    * @param appMap    accumulator from package name to tag array, updated in place
    */
  def iteratorJsonArray(jsonArray: JSONArray, appMap: Map[String, JSONArray]): Unit = {
    // Option(...) maps a null array (e.g. nothing could be parsed) to a no-op.
    Option(jsonArray).foreach { apps =>
      for (i <- 0 until apps.size()) {
        val app = apps.getJSONObject(i)
        val packageName = app.getString("package_name")
        val incomingTags = app.getJSONArray("tag")
        appMap.get(packageName) match {
          case Some(existingTags) =>
            val tagsById: Map[String, JSONObject] = Map()
            iteratorTag(incomingTags, tagsById)
            iteratorTag(existingTags, tagsById)
            val mergedTags = new JSONArray
            tagsById.values.foreach(tag => mergedTags.add(tag))
            appMap += (packageName -> mergedTags)
          case None =>
            appMap += (packageName -> incomingTags)
        }
      }
    }
  }

  /**
    * Registers every tag of `tags` in `tagMap` under its `id`, keeping the tag
    * already stored when an id is seen again.
    *
    * @param tags   array of tag objects; null is treated as empty
    * @param tagMap accumulator from tag id to tag object, updated in place
    */
  def iteratorTag(tags: JSONArray, tagMap: Map[String, JSONObject]): Unit = {
    // A missing "tag" array is skipped instead of raising a NullPointerException.
    Option(tags).foreach { present =>
      for (i <- 0 until present.size) {
        val tagJson = present.getJSONObject(i)
        val tagId = tagJson.getString("id")
        if (!tagMap.contains(tagId)) {
          tagMap += (tagId -> tagJson)
        }
      }
    }
  }

  /**
    * Writes a message to standard error.
    *
    * @param errInfo the error message to print
    */
  def printErrInfo(errInfo: String): Unit = {
    System.err.println(errInfo)
  }
}
