package mobvista.dmp.datasource.address

import java.util

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.RCFileInputFormat
import mobvista.dmp.function.FunctionManager
import mobvista.dmp.util.{BytesRefUtil, DateUtil}
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{Dataset, Row, SparkSession}

import scala.collection.mutable.ArrayBuffer

/**
  * Extracts address-related fields (IP, country and related location fields) per device
  * from the daily request logs and writes them out as ORC.
  *
  * Records are grouped by device id and platform. Within a group a record is kept only when
  * it is more than ten minutes later than the previously kept record, so the first record of
  * every ten-minute burst is the one that survives.
  */
class AddressInfoTotal extends CommonSparkJob with Serializable {

  val indexSplit = ","

  /**
    * Job entry point: parses the command line, loads the daily input in the requested
    * format, de-duplicates the records per device/platform and writes the result as ORC.
    *
    * @param args raw command-line arguments
    * @return 0 on success, 1 when the mandatory-option check fails
    */
  override protected def run(args: Array[String]): Int = {
    options.addOption("dailyFormat", true, "[must]")
    options.addOption("indices", true, "[must]")
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      return 1
    } else {
      printOptions(commandLine)
    }

    val input = commandLine.getOptionValue("input")
    val output = commandLine.getOptionValue("output")
    val dailyFormat = commandLine.getOptionValue("dailyFormat")
    val parallelism = commandLine.getOptionValue("parallelism").toInt
    // NOTE(review): "coalesce" is parsed (and therefore validated as an integer) but is not
    // applied anywhere in this job - confirm whether the output should be coalesced.
    val coalesce = commandLine.getOptionValue("coalesce").toInt
    val indices = commandLine.getOptionValue("indices")

    val spark = SparkSession.builder()
      .appName("AddressInfoTotal")
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "snappy")
      .config("spark.default.parallelism", parallelism)
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    import spark.implicits._
    val sc = spark.sparkContext

    try {
      // Pick the parser that matches the declared input format; anything that is neither
      // "orc" nor "rcfile" is read as plain text.
      val dailyDS: Dataset[AddressVO] =
        if ("orc".equalsIgnoreCase(dailyFormat)) {
          spark.read.format("orc").load(input)
            .map(parseORC(_, indices))
        } else if ("rcfile".equalsIgnoreCase(dailyFormat)) {
          sc.newAPIHadoopFile[LongWritable, BytesRefArrayWritable, RCFileInputFormat[LongWritable, BytesRefArrayWritable]](input)
            .map(tuple => parseRCFile(tuple._2, indices))
            .toDS()
        } else {
          sc.textFile(input)
            .map(parseText(_, indices))
            .toDS()
        }

      // Process the daily data.
      // No sortWithinPartitions before groupByKey: the shuffle triggered by the grouping does
      // not preserve that order, so each group is sorted by req_time inside doMapGroups.
      dailyDS
        .filter($"device_id".rlike(didPtn) && $"device_id".notEqual(allZero))
        .groupByKey(vo => s"${vo.device_id}${DATA_SPLIT}${vo.platform}")
        .flatMapGroups((key, itr) => doMapGroups(key, itr))
        .orderBy($"device_id".asc, $"req_time".asc)
        .write
        .option("orc.compress", "zlib")
        .orc(output)
    } finally {
      spark.stop()
    }
    0
  }


  /**
    * De-duplicates the records of one device/platform group.
    *
    * The group is first ordered by `req_time` (the fixed-width `yyyyMMdd HH:mm:ss` text sorts
    * chronologically). The earliest record is always kept; each later record is kept only
    * when it is more than ten minutes after the most recently kept one.
    *
    * @param key grouping key (device id and platform); not used by the computation
    * @param itr records of the group, in any order
    * @return the kept records in ascending `req_time` order
    */
  def doMapGroups(key: String, itr: Iterator[AddressVO]): Array[AddressVO] = {
    val kept = new ArrayBuffer[AddressVO]()

    // sortBy is stable, so records with equal timestamps keep their arrival order.
    // A null req_time is ordered first instead of failing inside the comparison.
    itr.toVector
      .sortBy(vo => Option(vo.req_time).getOrElse(""))
      .foreach { vo =>
        if (kept.isEmpty || isOutTenMin(vo.req_time, kept.last.req_time)) {
          kept += vo
        }
      }
    kept.toArray
  }

  /**
    * Tells whether `first` is more than ten minutes later than `second`.
    *
    * @param first  later timestamp, formatted as `yyyyMMdd HH:mm:ss`
    * @param second earlier timestamp, formatted as `yyyyMMdd HH:mm:ss`
    * @return true when `first - second` exceeds 600000 milliseconds
    */
  def isOutTenMin(first: String, second: String): Boolean = {
    val pattern = "yyyyMMdd HH:mm:ss"
    val tenMinutesMillis = 600000L
    val firstMillis = DateUtil.parse(first, pattern).getTime
    val secondMillis = DateUtil.parse(second, pattern).getTime
    (firstMillis - secondMillis) > tenMinutesMillis
  }

  /**
    * Builds an [[AddressVO]] from one ORC row.
    *
    * NOTE(review): the positional layout mirrors parseRCFile (device id, platform, request
    * time, ip, country). The previous implementation returned null, so the intended layout
    * for ORC input should be confirmed against the callers.
    *
    * @param row     row read from the ORC input
    * @param indices column positions joined by `indexSplit`
    * @return the parsed record
    */
  def parseORC(row: Row, indices: String): AddressVO = {
    val idxSplits = splitFun(indices, indexSplit)
    val deviceId = row.getString(idxSplits(0).toInt)
    val platform = row.getString(idxSplits(1).toInt)
    val reqTime = row.getString(idxSplits(2).toInt)
    val ip = row.getString(idxSplits(3).toInt)
    val country = row.getString(idxSplits(4).toInt)
    new AddressVO(deviceId, platform, reqTime, ip, country)
  }

  /**
    * Builds an [[AddressVO]] from one RCFile record.
    *
    * @param value   columnar record read from the RCFile input
    * @param indices column positions joined by `indexSplit`, in the order
    *                device id, platform, request time, ip, country
    * @return the parsed record
    */
  def parseRCFile(value: BytesRefArrayWritable, indices: String): AddressVO = {
    val idxSplits = splitFun(indices, indexSplit)
    val deviceId = BytesRefUtil.BytesRefWritableToString(value.get(idxSplits(0).toInt))
    val platform = BytesRefUtil.BytesRefWritableToString(value.get(idxSplits(1).toInt))
    val req_time = BytesRefUtil.BytesRefWritableToString(value.get(idxSplits(2).toInt))
    val ip = BytesRefUtil.BytesRefWritableToString(value.get(idxSplits(3).toInt))
    val country = BytesRefUtil.BytesRefWritableToString(value.get(idxSplits(4).toInt))
    new AddressVO(deviceId, platform, req_time, ip, country)
  }

  /**
    * Builds an [[AddressVO]] from one text line by delegating to the expression registered
    * under `FunctionManager.EXP_BUILD_ADDRESSVO_INPUT_INDICES`.
    *
    * @param line    raw input line, passed to the expression as `line`
    * @param indices column positions, passed to the expression as `indices`
    * @return the expression result cast to [[AddressVO]]
    */
  def parseText(line: String, indices: String): AddressVO = {
    val env = new util.HashMap[String, Object]()
    env.put("line", line)
    env.put("indices", indices)
    FunctionManager.getExpression(FunctionManager.EXP_BUILD_ADDRESSVO_INPUT_INDICES)
      .execute(env).asInstanceOf[AddressVO]
  }

}

/**
  * One address record of a device request.
  *
  * `country`, `province` and `city` are declared as `var`, so they can be reassigned after
  * construction.
  *
  * @param device_id device identifier
  * @param platform  device platform
  * @param req_time  request time as text
  * @param ip        request IP
  * @param country   country value
  * @param province  province value
  * @param city      city value
  * @param longitude longitude as text
  * @param latitude  latitude as text
  * @param other     additional free-form value
  */
case class AddressVO(
  device_id: String,
  platform: String,
  req_time: String,
  ip: String,
  var country: String,
  var province: String,
  var city: String,
  longitude: String,
  latitude: String,
  other: String
) {

  /** Builds a record that carries a country only; every remaining field is the empty string. */
  def this(device_id: String, platform: String, req_time: String, ip: String, country: String) =
    this(device_id, platform, req_time, ip, country,
      province = "", city = "", longitude = "", latitude = "", other = "")

  /** Builds a record that carries coordinates only; every remaining field is the empty string. */
  def this(device_id: String, platform: String, req_time: String, ip: String, longitude: String, latitude: String) =
    this(device_id, platform, req_time, ip,
      country = "", province = "", city = "", longitude = longitude, latitude = latitude, other = "")
}

/** Command-line launcher for the [[AddressInfoTotal]] job. */
object AddressInfoTotal {

  /**
    * Runs the job and propagates a non-zero status code to the process exit status, so that
    * a failed run (for example a missing mandatory option) is visible to the caller.
    *
    * @param args raw command-line arguments, forwarded to the job
    */
  def main(args: Array[String]): Unit = {
    val exitCode = new AddressInfoTotal().run(args)
    if (exitCode != 0) {
      sys.exit(exitCode)
    }
  }
}