// ComEgAndroidAlipayGphoneReyun.scala
package mobvista.dmp.datasource.dm

import mobvista.dmp.common.CommonSparkJob
import mobvista.dmp.format.TextMultipleOutputFormat
import org.apache.commons.cli.Options
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{concat_ws, lit}
import org.apache.spark.storage.StorageLevel

import java.net.URI
import scala.collection.mutable.ArrayBuffer
import scala.util.Try

/**
 * Spark job that selects devices from `dwh.dm_install_list_v2` for a fixed set of
 * package/business combinations and writes them as two tab-separated text outputs:
 *
 *  - `output1`: `device_id`, `device_type`, the literal `android`, and a bracketed
 *    package tag built by the query (e.g. `["<package_name>_<business>"]`);
 *  - `output2`: `device_id`, `device_type`, the literal `android`, and the literal `CN`.
 *
 * @author jiangfan
 * @date 2021/8/19 14:17
 */

class ComEgAndroidAlipayGphoneReyun extends CommonSparkJob with Serializable {

  /** Declares the command-line options consumed by `run`. All of them take a value. */
  override protected def buildOptions(): Options = {
    val options = new Options
    options.addOption("coalesce", true, "[must] coalesce")
    options.addOption("output1", true, "[must] output1")
    options.addOption("output2", true, "[must] output2")
    options.addOption("dt_today", true, "[must] dt_today")
    options.addOption("update", true, "[must] update")
    options.addOption("update02", true, "[must] update02")
    options
  }

  /**
   * Parses the arguments, validates them and runs the export.
   *
   * @param args raw command-line arguments
   * @return `0` on success; `-1` when `checkMustOption` rejects the command line or
   *         when `coalesce` is not a positive integer
   */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      -1
    } else {
      printOptions(commandLine)

      val coalesceArg = commandLine.getOptionValue("coalesce")
      // Validate before touching Spark or the file system so that a malformed
      // value can never leave the previous outputs deleted without replacement.
      parsePartitionCount(coalesceArg) match {
        case Some(numPartitions) =>
          export(
            output1 = commandLine.getOptionValue("output1"),
            output2 = commandLine.getOptionValue("output2"),
            dtToday = commandLine.getOptionValue("dt_today"),
            update = commandLine.getOptionValue("update"),
            update02 = commandLine.getOptionValue("update02"),
            numPartitions = numPartitions
          )
        case None =>
          System.err.println(s"Invalid value for 'coalesce': '$coalesceArg' (expected a positive integer)")
          printUsage(options)
          -1
      }
    }
  }

  /** Parses the `coalesce` option; yields `None` unless it is an integer greater than zero. */
  private def parsePartitionCount(raw: String): Option[Int] =
    Option(raw).flatMap(value => Try(value.trim.toInt).toOption).filter(_ > 0)

  /**
   * Creates the Spark session, runs the query and writes both outputs.
   * The session is stopped whether or not any step fails.
   */
  private def export(output1: String,
                     output2: String,
                     dtToday: String,
                     update: String,
                     update02: String,
                     numPartitions: Int): Int = {
    val spark = SparkSession.builder()
      .appName("ComEgAndroidAlipayGphoneReyun")
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "snappy")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    try {
      // Inside the try block so a failing delete still stops the session.
      deleteOutput(spark, output1)
      deleteOutput(spark, output2)
      configureGzipOutput(spark)

      // Persisted because the same result feeds two independent writes.
      val installs: DataFrame =
        spark.sql(buildQuery(dtToday, update, update02)).persist(StorageLevel.MEMORY_AND_DISK_SER)

      val deviceId = installs.col("device_id")
      val deviceType = installs.col("device_type")
      val platform = lit("android")

      val withPackage = installs.select(concat_ws("\t", deviceId, deviceType, platform, installs.col("package_name")))
      val withCountry = installs.select(concat_ws("\t", deviceId, deviceType, platform, lit("CN")))

      withPackage.coalesce(numPartitions).write.format("text").mode("overwrite").save(output1)
      withCountry.coalesce(numPartitions).write.format("text").mode("overwrite").save(output2)
      0
    } finally {
      spark.stop()
    }
  }

  /**
   * Recursively deletes `output` using the file system that owns the path, so the
   * delete targets the same location the subsequent write uses, whatever bucket or
   * scheme the path names.
   */
  private def deleteOutput(spark: SparkSession, output: String): Unit = {
    val path = new Path(output)
    path.getFileSystem(spark.sparkContext.hadoopConfiguration).delete(path, true)
  }

  /** Sets the Hadoop output-compression keys (old and new style) to block-level gzip. */
  private def configureGzipOutput(spark: SparkSession): Unit = {
    val conf = spark.sparkContext.hadoopConfiguration
    conf.set("mapreduce.output.compress", "true")
    conf.set("mapreduce.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    conf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString)
    conf.setClass("mapreduce.output.fileoutputformat.compress.codec", classOf[GzipCodec], classOf[CompressionCodec])
  }

  /**
   * Builds the three-way `union` query over `dwh.dm_install_list_v2`.
   *
   * @param dtToday  value compared against the `dt` column in every branch
   * @param update   lower bound on `update_date` for the first two branches
   * @param update02 lower bound on `update_date` for the `dsp_req` branch
   */
  private def buildQuery(dtToday: String, update: String, update02: String): String =
    s"""
       |select
       |    device_id, device_type ,concat("[\\"",package_name,"_",business,"\\"]")  as  package_name
       |from
       |    dwh.dm_install_list_v2
       |where dt='${dtToday}'  and  business='reyun'  and  package_name='com.eg.android.AlipayGphone'
       |    and device_type  in ('imei','gaid','oaid','idfa','imeimd5','gaidmd5','oaidmd5','idfamd5')
       |    and  update_date >= "${update}"
       |union
       |select
       |    device_id, device_type ,concat("[\\"",package_name,"_",business,"\\"]")  as  package_name
       |from
       |    dwh.dm_install_list_v2
       |where dt='${dtToday}' and business in ('reyun','btop')
       |    and package_name in ('com.taobao.litetao','com.ss.android.ugc.aweme')
       |    and device_type  in ('imei','gaid','oaid','idfa','imeimd5','gaidmd5','oaidmd5','idfamd5')
       |    and  update_date >= "${update}"
       |union
       |select
       |    device_id, device_type ,concat("[\\"",package_name,"_3","\\"]")  as  package_name
       |from
       |    dwh.dm_install_list_v2
       |where dt='${dtToday}' and business in ('dsp_req')
       |    and package_name in ('com.taobao.taobao_iqiyi')
       |    and  update_date >= "${update02}"
        """.stripMargin

}


object ComEgAndroidAlipayGphoneReyun {

  /**
   * Entry point: runs the job and propagates a non-zero status code from `run`
   * as the process exit code, so the launcher can see that the job failed.
   *
   * @param args raw command-line arguments, forwarded unchanged to the job
   */
  def main(args: Array[String]): Unit = {
    val exitCode = new ComEgAndroidAlipayGphoneReyun().run(args)
    // Exit explicitly only on failure; a successful run returns normally.
    if (exitCode != 0) sys.exit(exitCode)
  }
}