Old2NewInstallList.scala 4.04 KB
Newer Older
wang-jinfeng committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
package mobvista.dmp.common

import mobvista.dmp.common.MobvistaConstant.deviceTypeSet
import mobvista.dmp.util.DateUtil
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.{SaveMode, SparkSession}

import java.net.URI

/**
 * Spark job that reads one `dt`/`business` slice of `dwh.dm_install_list_v2`, groups it by
 * (device_id, device_type, platform), maps each group to a `DmpInstallList` row and writes the
 * rows whose device type is in `MobvistaConstant.deviceTypeSet` as snappy-compressed ORC.
 *
 * Command-line options (all required): `-date`, `-business`, `-output`, `-coalesce`.
 *
 * Author: wangjf (jinfeng.wang@mobvista.com, 152-1062-7698)
 * Created: 2020/4/9 2:39 PM
 */
class Old2NewInstallList extends CommonSparkJob with Serializable {

  /** Declares the four options understood by [[run]]; every option takes a value. */
  def commandOptions(): Options = {
    val options = new Options()
    options.addOption("date", true, "date")
    options.addOption("business", true, "business")
    options.addOption("output", true, "output")
    options.addOption("coalesce", true, "coalesce")
    options
  }

  /**
   * Parses the arguments, runs the aggregation query and writes the result to `output`.
   *
   * @param args raw command-line arguments, see [[commandOptions]]
   * @return 0 when the write completed; failures surface as exceptions
   * @throws IllegalArgumentException if an option is missing/blank or `coalesce` is not a
   *                                  positive integer (checked before anything is deleted)
   */
  override protected def run(args: Array[String]): Int = {
    val commandLine = new BasicParser().parse(commandOptions(), args)

    // getOptionValue returns null for an absent option; turn that into a clear, early failure.
    def requiredOption(name: String): String =
      Option(commandLine.getOptionValue(name))
        .filter(_.trim.nonEmpty)
        .getOrElse(throw new IllegalArgumentException(s"Missing required option: -$name"))

    val date = requiredOption("date")
    val business = requiredOption("business")
    val output = requiredOption("output")
    // Parsed up front so that a bad value cannot fail the job after the old output is gone.
    val partitions = requiredOption("coalesce").trim.toInt
    require(partitions > 0, s"Option -coalesce must be a positive integer, got: $partitions")

    val spark = SparkSession
      .builder()
      .appName(s"Old2NewInstallList.$date.$business")
      .config("spark.rdd.compress", "true")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.io.compression.codec", "snappy")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    try {
      val updateDate = DateUtil.format(DateUtil.getDay(date, "yyyyMMdd", 0), "yyyy-MM-dd")

      // Resolve the FileSystem from the output path itself so the delete always targets the
      // same location the ORC writer below will use, whatever bucket/scheme `output` names.
      val outputPath = new Path(output)
      outputPath.getFileSystem(spark.sparkContext.hadoopConfiguration).delete(outputPath, true)

      spark.udf.register("toJsonBySplit", mobvista.dmp.common.MobvistaConstant.toJsonBySplitV1 _)
      spark.udf.register("check_device", mobvista.dmp.common.MobvistaConstant.checkDeviceId _)

      import spark.implicits._
      val query = sql
        .replace("@dt", date)
        .replace("@update_date", updateDate)
        .replace("@business", business)

      val installs = spark.sql(query).rdd.map { r =>
        // Literal-first comparisons: a NULL column yields null here and must not throw.
        val rawDeviceType = r.getAs[String]("device_type")
        val deviceType =
          if ("android_id".equalsIgnoreCase(rawDeviceType) || "androidid".equalsIgnoreCase(rawDeviceType)) {
            "androidid"
          } else {
            rawDeviceType
          }

        val rawPlatform = r.getAs[String]("platform")
        val platform =
          if ("android".equalsIgnoreCase(rawPlatform) || "android2".equalsIgnoreCase(rawPlatform)
            || "adr".equalsIgnoreCase(rawPlatform)) {
            "android"
          } else if ("ios".equalsIgnoreCase(rawPlatform) || "ios2".equalsIgnoreCase(rawPlatform)) {
            "ios"
          } else {
            rawPlatform
          }

        DmpInstallList(device_id = r.getAs[String]("device_id"), device_type = deviceType, platform = platform,
          country = r.getAs[String]("country"), install_list = r.getAs[String]("install_list"),
          ext_data = r.getAs[String]("ext_data"), update_date = r.getAs[String]("update_date"))
      }

      // Rows without a device type can never be in the accepted set; drop them explicitly.
      installs
        .filter(install => install.device_type != null && deviceTypeSet.contains(install.device_type))
        .toDF
        .repartition(partitions)
        .write
        .mode(SaveMode.Overwrite)
        .option("orc.compress", "snappy")
        .orc(output)

    } finally {
      spark.stop()
    }
    0
  }

  /**
   * Query template. `@dt`, `@update_date` and `@business` are substituted textually in [[run]]
   * before execution; `toJsonBySplit` and `check_device` are the UDFs registered there.
   */
  val sql: String =
    """
      |SELECT device_id, device_type, platform, '' country, toJsonBySplit(CONCAT_WS(';',COLLECT_SET(CONCAT(package_name, "#", update_date))), platform, '@update_date') install_list,
      | '{}' ext_data, COALESCE(MAX(update_date), '@update_date') update_date
      | FROM dwh.dm_install_list_v2 WHERE `dt` = '@dt' AND business = '@business' AND check_device(device_id)
      | GROUP BY device_id, device_type, platform
      |""".stripMargin
}

object Old2NewInstallList {

  /**
   * JVM entry point: instantiates the job and passes the raw arguments to its `run` method.
   * The integer returned by `run` is discarded.
   */
  def main(args: Array[String]): Unit = {
    val job = new Old2NewInstallList()
    job.run(args)
  }
}