package mobvista.dmp.datasource.iqiyi

import java.net.URI

import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
import mobvista.dmp.util.DateUtil
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

/**
  * Spark job that builds the daily iQiYi re-activation ("lahuo") device list.
  *
  * What `run` does, in order:
  *  1. selects MD5-hashed IMEI ids of CN devices from `dwh.ods_dmp_user_info`
  *     (partition `dt = date`, `last_req_day` within the configured look-back window);
  *  2. removes every id already present in `dwh.iqiyi_lahuo_tmp_daily_to_s3`
  *     (`business = 'iqiyi_activation'`, `dt` between `date - 2` and `date`);
  *  3. caps the result at 350,000,000 ids, or tops it up with previously delivered ids
  *     when fewer fresh ids are available;
  *  4. writes the ids as gzip-compressed text, each line holding up to 50 comma-separated ids.
  *
  * @package: mobvista.dmp.datasource.iqiyi
  * @author: wangjf
  * @date: 2020/9/9
  * @time: 10:36 AM
  * @email: jinfeng.wang@mobvista.com
  * @phone: 152-1062-7698
  */
class IQiYiLaHuoDaily extends CommonSparkJob with Serializable {

  /**
    * Declares the command-line options understood by this job. Every option takes a value.
    *
    *  - `date`:   partition date, parsed by `run` with the pattern `yyyyMMdd`
    *  - `days`:   integer look-back window applied to `last_req_day`
    *  - `output`: directory the result is written to (deleted first if it exists)
    *
    * @return the option set handed to the CLI parser
    */
  def commandOptions(): Options = {
    val options = new Options()
    options.addOption("date", true, "date")
    options.addOption("days", true, "days")
    options.addOption("output", true, "output")
    options
  }

  /**
    * Parses the arguments, computes the device list and writes it to `output`.
    *
    * @param args raw command-line arguments, see `commandOptions`
    * @return always 0; failures surface as exceptions
    * @throws IllegalArgumentException if `date`, `days` or `output` is not supplied
    */
  override protected def run(args: Array[String]): Int = {
    val commandLine = new BasicParser().parse(commandOptions(), args)
    val date = commandLine.getOptionValue("date")
    val days = commandLine.getOptionValue("days")
    val output = commandLine.getOptionValue("output")
    // getOptionValue yields null for an absent option; fail with a clear message before
    // a Spark session is started instead of hitting a NullPointerException later on.
    require(date != null && days != null && output != null,
      "options -date, -days and -output are all required")

    val spark = MobvistaConstant.createSparkSession(s"IQiYiLaHuoDaily.$date")
    import spark.implicits._

    try {
      val sc = spark.sparkContext
      val hadoopConf = sc.hadoopConfiguration

      // Lower bound for last_req_day, rendered as yyyy-MM-dd.
      // NOTE(review): presumably `date` shifted back by `days` days - confirm against DateUtil.getDay.
      val lastReqDay = DateUtil.format(DateUtil.getDay(date, "yyyyMMdd", -days.toInt), "yyyy-MM-dd")

      // Remove any previous output. The file system is resolved from the output path itself so
      // that an output outside the historical bucket is handled too; a path without a scheme
      // keeps using the bucket that used to be hard-coded here.
      val outputPath = new Path(output)
      val fs =
        if (outputPath.toUri.getScheme != null) outputPath.getFileSystem(hadoopConf)
        else FileSystem.get(new URI("s3://mob-emr-test"), hadoopConf)
      fs.delete(outputPath, true)

      // Ids already delivered in the dt partitions from (date - 2) up to and including date.
      val beginDay = DateUtil.format(DateUtil.getDay(date, "yyyyMMdd", -2), "yyyyMMdd")
      val oldData: RDD[String] = spark
        .sql(old_data_sql.replace("@begin_day", beginDay).replace("@end_day", date))
        .rdd
        .map(_.mkString)

      // Fresh candidates: distinct ids of the day minus the ids delivered recently.
      val subtractRdd: RDD[String] = spark
        .sql(sql.replace("@date", date).replace("@last_req_day", lastReqDay))
        .repartition(500)
        .rdd
        .distinct()
        .map(_.mkString)
        .subtract(oldData)

      val lahuoDataSize = 350000000

      // count() returns a Long; it is kept as a Long because narrowing it to Int would wrap
      // around for more than Int.MaxValue candidates and select the wrong branch below.
      // NOTE(review): subtractRdd is not cached, so it is evaluated here and again for the write.
      val candidateCount: Long = subtractRdd.count()

      val resultRdd: RDD[String] =
        if (candidateCount >= lahuoDataSize) {
          println("number is enough, number is " + candidateCount)
          // limit has no ordering, so which ids survive the cut is arbitrary.
          subtractRdd.toDF.limit(lahuoDataSize).rdd.map(_.mkString)
        } else {
          println("number is not enough,after deduplication number is " + candidateCount)
          // candidateCount < lahuoDataSize here, so the shortfall always fits in an Int.
          val shortfall = (lahuoDataSize - candidateCount).toInt
          // Top up with ids that were already delivered in the recent partitions.
          subtractRdd.union(oldData.toDF.limit(shortfall).rdd.map(_.mkString))
        }

      val batchSize = 50
      resultRdd.repartition(500)
        .mapPartitions(ids => {
          // Pack the partition into lines of up to batchSize distinct ids joined by commas.
          val lines = new ArrayBuffer[String]()
          var batch = new mutable.HashSet[String]()
          while (ids.hasNext) {
            batch.add(ids.next())
            // Flush a full batch, or the final (possibly smaller) one at the end of the partition.
            if (batch.size == batchSize || !ids.hasNext) {
              lines += batch.mkString(",")
              batch = new mutable.HashSet[String]()
            }
          }
          lines.iterator
        }).coalesce(100).saveAsTextFile(output, classOf[GzipCodec])

    } finally {
      spark.stop()
    }
    0
  }

  /**
    * Query template for the day's candidate devices. `@date` and `@last_req_day` are substituted
    * in `run`. Plain IMEIs are MD5-hashed; `imeimd5` ids are passed through unchanged.
    */
  val sql: String =
    """
       |SELECT CASE WHEN device_type = 'imei' THEN MD5(device_id) WHEN device_type = 'imeimd5' THEN device_id END AS device_id
       |  FROM dwh.ods_dmp_user_info
       |  WHERE dt ='@date' AND device_type IN ('imei','imeimd5')
       |  AND last_req_day >= '@last_req_day' AND UPPER(country) = 'CN'
        """.stripMargin

  // Each day's iQiYi re-activation result is stored in dwh.iqiyi_lahuo_tmp_daily_to_s3 under
  // business='iqiyi_activation', in that day's dt partition. Before a new run, the candidate
  // devices are de-duplicated against the results of the recent partitions selected below
  // (`@begin_day` to `@end_day`, both substituted in `run`).
  val old_data_sql: String =
    """
       |select distinct device_id from dwh.iqiyi_lahuo_tmp_daily_to_s3 where dt>='@begin_day' and dt<='@end_day' and business='iqiyi_activation'
       |""".stripMargin

}

object IQiYiLaHuoDaily {

  /**
    * Program entry point: creates the job and runs it with the given arguments.
    * The integer status returned by the job is discarded.
    *
    * @param args command-line arguments forwarded unchanged to the job
    */
  def main(args: Array[String]): Unit = {
    val job = new IQiYiLaHuoDaily()
    job.run(args)
  }
}