UCTmpDataToDMP.scala 4.9 KB
Newer Older
wang-jinfeng committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
package mobvista.dmp.datasource.taobao

import java.net.URI

import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import mobvista.dmp.common.CommonSparkJob
import org.apache.commons.cli.Options
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ArrayBuffer


/**
 * Spark job that converts UC request/response dumps into tab-separated device records.
 *
 * For each of the two device types handled here (`imeimd5` and `oaidmd5`) the job:
 *  - parses the response files and writes one set of "foractivation" records for every
 *    device id listed in them, and
 *  - writes a "notforactivation" record for every requested device id that does not
 *    appear in the response.
 *
 * Every output line has the layout
 * `deviceId \t deviceType \t android \t tag \t update`.
 */
class UCTmpDataToDMP extends CommonSparkJob with Serializable {

  /** Declares the command-line options read by [[run]]; every option takes a value. */
  override protected def buildOptions(): Options = {
    val options = new Options
    Seq(
      "imeiRequestInput",
      "oaidRequestInput",
      "imeiResponseInput",
      "oaidResponseInput",
      "imeiOutput",
      "oaidOutput",
      "imeiNotActivationOutput",
      "oaidNotActivationOutput",
      "update"
    ).foreach(name => options.addOption(name, true, s"[must] $name"))
    options
  }

  /**
   * Expands one response line into output records.
   *
   * The line is parsed as a JSON array of objects, each carrying a `crowdCode` string and a
   * `uids` array of device ids. For every device id two records are produced, in this order:
   * one tagged `com.uc.foractivation_<first 4 chars of device_type>` and one tagged
   * `com.uc.foractivation.<first 6 chars of crowdCode>`.
   *
   * Incomplete entries are tolerated instead of aborting the whole job: entries without a
   * `uids` array and null/empty device ids are skipped, and the crowd-specific record is
   * omitted when `crowdCode` is missing or empty. A line that is not a valid JSON array
   * still makes `JSON.parseArray` throw.
   *
   * @param row         raw response line (JSON array text)
   * @param update      value written to the last column of every record
   * @param device_type value written to the second column, e.g. `imeimd5`
   * @return the tab-separated records derived from `row`
   */
  private def buildRes(row: String, update: String, device_type: String): Array[String] = {
    val jsonData: JSONArray = JSON.parseArray(row)
    // `take` never throws on short strings, unlike `substring(0, n)`.
    val typeTag = "com.uc.foractivation_" + device_type.take(4)

    val records = for {
      entryIdx <- 0 until jsonData.size
      entry <- Option(jsonData.getJSONObject(entryIdx)).toList
      uids <- Option(entry.getJSONArray("uids")).toList
      crowdTag = Option(entry.getString("crowdCode"))
        .filter(_.nonEmpty)
        .map(code => "com.uc.foractivation." + code.take(6))
      uidIdx <- 0 until uids.size
      deviceId <- Option(uids.getString(uidIdx)).filter(_.nonEmpty).toList
      tag <- typeTag :: crowdTag.toList
    } yield Seq(deviceId, device_type, "android", tag, update).mkString("\t")

    records.toArray
  }

  /**
   * Runs the response/request pipeline for a single device type.
   *
   * Response lines containing `crowdCode` are expanded with [[buildRes]] and saved to
   * `output`. Requested ids (comma-separated per line in `requestInput`) that are not the
   * first column of any response record are saved to `notActivationOutput`, tagged with
   * `notActivationTag`. Both outputs are coalesced to 50 partitions and gzip-compressed.
   */
  private def exportDeviceType(spark: SparkSession,
                               responseInput: String,
                               requestInput: String,
                               output: String,
                               notActivationOutput: String,
                               deviceType: String,
                               notActivationTag: String,
                               update: String): Unit = {
    val sc = spark.sparkContext

    val matchedRdd: RDD[String] = sc.textFile(responseInput)
      .filter(_.contains("crowdCode"))
      .flatMap(buildRes(_, update, deviceType))
    val requestedRdd: RDD[String] = sc.textFile(requestInput).flatMap(_.split(","))

    matchedRdd.coalesce(50).saveAsTextFile(output, classOf[GzipCodec])

    requestedRdd
      .subtract(matchedRdd.map(_.split("\t")(0)))
      .map(id => Seq(id, deviceType, "android", notActivationTag, update).mkString("\t"))
      .coalesce(50)
      .saveAsTextFile(notActivationOutput, classOf[GzipCodec])
  }

  /**
   * Parses the arguments, clears the output directories and runs the IMEI and OAID pipelines.
   *
   * @param args command-line arguments matching [[buildOptions]]
   * @return `0` when the job completed, `-1` when a required option is missing
   */
  override protected def run(args: Array[String]): Int = {
    val commandLine = commParser.parse(options, args)
    if (!checkMustOption(commandLine)) {
      printUsage(options)
      -1
    } else {
      printOptions(commandLine)

      val imeiRequestInput = commandLine.getOptionValue("imeiRequestInput")
      val oaidRequestInput = commandLine.getOptionValue("oaidRequestInput")
      val imeiResponseInput = commandLine.getOptionValue("imeiResponseInput")
      val oaidResponseInput = commandLine.getOptionValue("oaidResponseInput")
      val imeiOutput = commandLine.getOptionValue("imeiOutput")
      val oaidOutput = commandLine.getOptionValue("oaidOutput")
      val imeiNotActivationOutput = commandLine.getOptionValue("imeiNotActivationOutput")
      val oaidNotActivationOutput = commandLine.getOptionValue("oaidNotActivationOutput")
      val update = commandLine.getOptionValue("update")

      val spark = SparkSession.builder()
        .appName("UCTmpDataToDMP")
        .config("spark.rdd.compress", "true")
        .config("spark.io.compression.codec", "snappy")
        .config("spark.sql.orc.filterPushdown", "true")
        .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .enableHiveSupport()
        .getOrCreate()

      try {
        // saveAsTextFile refuses to write into an existing directory, so every output path
        // (not only the two activation ones) has to be cleared for the job to be re-runnable.
        // Done inside the try so the session is stopped even if a delete fails.
        val fs = FileSystem.get(new URI("s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration)
        Seq(imeiOutput, oaidOutput, imeiNotActivationOutput, oaidNotActivationOutput)
          .foreach(path => fs.delete(new Path(path), true))

        exportDeviceType(spark, imeiResponseInput, imeiRequestInput, imeiOutput,
          imeiNotActivationOutput, "imeimd5", "com.uc.notforactivation_imei", update)

        exportDeviceType(spark, oaidResponseInput, oaidRequestInput, oaidOutput,
          oaidNotActivationOutput, "oaidmd5", "com.uc.notforactivation_oaid", update)
      } finally {
        spark.stop()
      }
      0
    }
  }

}


/** Command-line entry point for the [[UCTmpDataToDMP]] job. */
object UCTmpDataToDMP {

  /**
   * Runs the job with the given arguments.
   *
   * A non-zero status returned by `run` (e.g. `-1` for missing required options) is
   * propagated as the JVM exit code so the caller can detect the failure; previously the
   * status was discarded and the process always exited with 0.
   *
   * @param args command-line arguments forwarded to `run`
   */
  def main(args: Array[String]): Unit = {
    val exitCode = new UCTmpDataToDMP().run(args)
    if (exitCode != 0) sys.exit(exitCode)
  }
}