package mobvista.dmp.datasource.rtdmp

import com.alibaba.fastjson.JSON
import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
import mobvista.dmp.util.{DateUtil, MySQLUtil}
import mobvista.dmp.utils.clickhouse.ClickHouseConnectionFactory
import mobvista.dmp.utils.clickhouse.ClickHouseSparkExt._
import org.apache.commons.cli.{BasicParser, Options}
import ru.yandex.clickhouse.ClickHouseDataSource

import java.text.SimpleDateFormat
import java.util.Calendar
import scala.collection.JavaConversions._
import scala.collection.mutable
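
// Overview (summarized from the code below): hourly Spark job that
//   1. drops the ClickHouse partition that expired 6 hours ago,
//   2. reads dwh.audience_merge for the given hour and keeps, per device,
//      the audience ids still active according to expire_time,
//   3. bulk-writes the result into the target (dt, hour) partition, and
//   4. records the processed batch in MySQL.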


/**
 * @package: mobvista.dmp.datasource.rtdmp
 * @author: wangjf
 * @date: 2020/8/17
 * @time: 10:57 AM
 * @email: jinfeng.wang@mobvista.com
 * @phone: 152-1062-7698
 */
class RTDmpMergeCK extends CommonSparkJob with Serializable {
  // Watermark set in run(): audience_map entries whose update time is not
  // after expire_time are filtered out by the `process` UDF.
  var expire_time = ""

  def commandOptions(): Options = {
    val options = new Options()
    options.addOption("date_time", true, "date_time")
    options.addOption("host", true, "host")
    options.addOption("cluster", true, "cluster")
    options.addOption("database", true, "database")
    options.addOption("table", true, "table")
    options
  }
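
  // Illustrative invocation (all argument values are hypothetical; Commons
  // CLI's BasicParser accepts these as single-dash options):
  //   spark-submit --class mobvista.dmp.datasource.rtdmp.RTDmpMergeCK dmp.jar \
  //     -date_time 2020081710 -host 127.0.0.1 -cluster cluster_1st \
  //     -database dwh -table audience_merge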

  override protected def run(args: Array[String]): Int = {
    val parser = new BasicParser()
    val options = commandOptions()
    val commandLine = parser.parse(options, args)
    val date_time = commandLine.getOptionValue("date_time")
    val cluster = commandLine.getOptionValue("cluster")
    val host = commandLine.getOptionValue("host")
    val database = commandLine.getOptionValue("database")
    val table = commandLine.getOptionValue("table")

    val spark = MobvistaConstant.createSparkSession(s"RTDmpMergeCK.$date_time")

    try {
      // Cut-off timestamp (yyyyMMddHH), rolled back from date_time via
      // DateUtil.getDayByString; older audience_map entries are dropped
      // by the `process` UDF.
      expire_time = DateUtil.getDayByString(date_time, "yyyyMMddHH", -1)

      // Empty DataFrame, used only as a receiver for the dropPartition
      // extension method.
      val tdf = spark.emptyDataFrame
      val sdf = new SimpleDateFormat("yyyyMMddHH")
      // Drop the expired partition: the one 6 hours before date_time.
      val calendar = Calendar.getInstance()
      val date = sdf.parse(date_time)
      calendar.setTime(date)
      calendar.add(Calendar.HOUR_OF_DAY, -6)
      val expire_part = sdf.format(calendar.getTime)
      var dt_part = expire_part.substring(0, 8)
      var hour_part = expire_part.substring(8, 10)

      // Shared ClickHouse connection used by the dropPartition and
      // saveToClickHouse extension methods.
      implicit val clickhouseDataSource: ClickHouseDataSource = ClickHouseConnectionFactory.get(host)

      val clusterName: Option[String] = Some(cluster)

      // Drop the 6-hour-old partition, e.g. (20200817,'04') for date_time 2020081710.
      tdf.dropPartition(database, table, s"($dt_part,'$hour_part')", clusterName)

      // Register the UDF referenced in the SQL below.
      spark.udf.register("process", process _)

      // Keep only devices that still belong to at least one active audience.
      val df = spark.sql(sql.replace("@dt", date_time))
        .filter("size(audience_id) > 0")

      // Target partition for this batch: dt from date_time, hour from
      // expire_time (the same hour as date_time when the offset is a whole
      // number of days).
      dt_part = date_time.substring(0, 8)
      hour_part = expire_time.substring(8, 10)
      // Reformat dt_part (yyyyMMdd) with MobvistaConstant's date formatters,
      // presumably to yyyy-MM-dd.
      val dt = MobvistaConstant.sdf1.format(MobvistaConstant.sdf2.parse(dt_part))

      // Drop the target partition first so the insert below is idempotent on reruns.
      tdf.dropPartition(database, table, s"($dt_part,'$hour_part')", clusterName)

      // Give ClickHouse time to apply the partition drop across the cluster
      // before re-inserting.
      Thread.sleep(120000)

      // Write into the (dt, hour) partition in batches of 200,000 rows.
      df.saveToClickHouse(database, table, Seq(dt, hour_part), Seq("dt", "hour"), clusterName, batchSize = 200000)

      // Record the processed batch in MySQL.
      MySQLUtil.update(database, table, date_time)

    } finally {
      if (spark != null) {
        spark.stop()
      }
    }
    0
  }

  // The @dt placeholder is replaced with the batch's date_time at runtime.
  val sql =
    """
      |SELECT devid, process(audience_map) audience_id,
      |  CASE WHEN device_type IS NOT NULL AND device_type != '' THEN device_type ELSE 'unknown' END AS device_type
      |  FROM dwh.audience_merge
      |  WHERE dt = '@dt'
      |""".stripMargin

  def process(audience_map: String): mutable.WrappedArray[Int] = {
    val set = new mutable.HashSet[Int]()
    // audience_map is a JSON object mapping audience id -> last-update time
    // (presumably yyyyMMddHH, since values are string-compared with expire_time).
    val audienceMap = JSON.parseObject(audience_map).asInstanceOf[java.util.Map[String, String]]
    // Keep only audiences updated strictly after expire_time; return their ids.
    audienceMap.retain((_, v) => v.compareTo(expire_time) > 0).keys.foreach(k => {
      set.add(Integer.parseInt(k))
    })
    mutable.WrappedArray.make(set.toArray)
  }
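
  // Illustrative behaviour of `process` (values hypothetical): with
  // expire_time = "2020081609",
  //   process("""{"123":"2020081710","456":"2020081508"}""")
  // returns WrappedArray(123): "2020081710" > "2020081609" lexicographically,
  // while "2020081508" is not, so audience 456 is dropped.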
}

object RTDmpMergeCK {
  def main(args: Array[String]): Unit = {
    new RTDmpMergeCK().run(args)
  }
}