Commit a7c11c2b by WangJinfeng

init id_mapping

parent 228e128a
 package mobvista.dmp.datasource.id_mapping

 import com.alibaba.fastjson.JSONObject
+import mobvista.dmp.common.MobvistaConstant.sdf1
 import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
 import mobvista.dmp.datasource.id_mapping.Constant._
 import mobvista.dmp.util.MD5Util
 import org.apache.commons.cli.{BasicParser, Options}
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.{Row, SaveMode, SparkSession}
+import org.apache.spark.storage.StorageLevel

 import java.net.URI
 import scala.collection.JavaConverters._
@@ -57,7 +58,7 @@ class IDMappingGraphx extends CommonSparkJob with Serializable {

   def oldAndTodayIdMapping(country: String, platform: String, date: String, spark: SparkSession, outPutPath: String,
-                           edgeoutPutPath: String, coalesce: Int) = {
+                           resultOutPutPath: String, coalesce: Int) = {
     implicit val formats = org.json4s.DefaultFormats
@@ -65,6 +66,7 @@ class IDMappingGraphx extends CommonSparkJob with Serializable {
     var schame: StructType = null
     var idSet: Array[String] = null
     var idMainSet: Set[String] = null
+    var scoreMap: Map[String, Double] = null
     // 1. Load today's data
     platform match {
       case "ios" =>
@@ -72,9 +74,11 @@ class IDMappingGraphx extends CommonSparkJob with Serializable {
         schame = iosVertSchema
         idSet = iosIDSet
         idMainSet = iosMainIDSet
+        scoreMap = iosIDScoreMap
       case "android" => {
         schame = adrVertSchema
         idMainSet = androidMainIDSet
+        scoreMap = androidIDScoreMap
         country match {
           case "CN" =>
             idSet = androidCNIDSet
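
Editor's note: iosIDSet, androidCNIDSet, iosIDScoreMap, and androidIDScoreMap are defined in mobvista.dmp.datasource.id_mapping.Constant, which is not part of this diff. A minimal sketch of the shape the new scoreMap wiring expects; the ID names and weights below are illustrative assumptions, not the repository's actual values:

// Illustrative only: plausible shapes for the Constant members referenced above.
object ConstantSketch {
  // Priority-ordered ID types; a lower index is treated as more trustworthy (assumed order).
  val iosIDSet: Array[String] = Array("idfa", "idfv", "sysid")
  val androidCNIDSet: Array[String] = Array("imei", "oaid", "android_id", "sysid")
  // Base weight per ID type, consumed by the scoring step added in this commit (assumed values).
  val iosIDScoreMap: Map[String, Double] = Map("idfa" -> 10.0, "idfv" -> 8.0, "sysid" -> 2.0)
  val androidIDScoreMap: Map[String, Double] = Map("imei" -> 10.0, "oaid" -> 9.0, "android_id" -> 8.0, "sysid" -> 2.0)
}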
@@ -127,29 +131,76 @@ class IDMappingGraphx extends CommonSparkJob with Serializable {
       ((srcID, idType), oneID.toJSONString)
     })
-    val mergeOneIDRDD = multiOneIDRDD.union(singleOneIDRDD).combineByKey(
+    val midMergeOneIDRDD = multiOneIDRDD.union(singleOneIDRDD).combineByKey(
       (v: String) => Set(v),
       (c: Set[String], v: String) => c ++ Seq(v),
       (c1: Set[String], c2: Set[String]) => c1 ++ c2
     ).map(kv => {
-      val srcId = kv._1._1
+      val srcId = if (kv._1._1.matches(MobvistaConstant.md5Ptn)) {
+        kv._1._1
+      } else {
+        MD5Util.getMD5Str(kv._1._1)
+      }
       val srcType = kv._1._2
-      val oneID = new JSONObject()
+      val oneIDJSON = new JSONObject()
       kv._2.foreach(js => {
         val json = MobvistaConstant.String2JSONObject(js)
         val keys = json.keySet().asScala
         keys.foreach(key => {
-          oneID.put(key, json.getJSONObject(key))
+          val oneID = if (key.matches(MobvistaConstant.md5Ptn)) {
+            key
+          } else {
+            MD5Util.getMD5Str(key)
+          }
+          oneIDJSON.put(oneID, json.getJSONObject(key))
         })
       })
-      (srcId, srcType, oneID)
-    })
+      Result(srcId, srcType, oneIDJSON.toJSONString)
+    }).persist(StorageLevel.MEMORY_AND_DISK_SER)
     FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(outPutPath), true)
-    mergeOneIDRDD
-      .repartition(coalesce)
-      .saveAsTextFile(outPutPath, classOf[GzipCodec])
+    import spark.implicits._
+    midMergeOneIDRDD.toDF
+      .repartition(coalesce)
+      .write.mode(SaveMode.Overwrite)
+      .option("orc.compress", "zlib")
+      .orc(outPutPath)
+    FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(resultOutPutPath), true)
+    val resultOneID = midMergeOneIDRDD.mapPartitions(rs => {
+      rs.map(r => {
+        val device_id = r.device_id
+        val device_type = r.device_type
+        val one_id = MobvistaConstant.String2JSONObject(r.one_id)
+        val keys = one_id.keySet().asScala
+        var oneIDScore: OneIDScore = OneIDScore("", "", 0)
+        keys.foreach(key => {
+          val json = one_id.getJSONObject(key)
+          val id_type = json.getString("id_type")
+          val id_type_score = scoreMap(id_type)
+          val active_date = json.getString("active_date")
+          val cnt = json.getIntValue("cnt")
+          val days = (sdf1.parse(date).getTime - sdf1.parse(active_date).getTime) / 1000 / 3600 / 24 + 1
+          val score = id_type_score * 30 / days + 0.1 * cnt
+          if (idSet.indexOf(id_type) < idSet.indexOf(oneIDScore.one_type) || idSet.indexOf(oneIDScore.one_type) == -1
+            || (idSet.indexOf(id_type) == idSet.indexOf(oneIDScore.one_type) && score >= oneIDScore.one_score)) {
+            oneIDScore = OneIDScore(key, id_type, score)
+          }
+        })
+        val json = new JSONObject()
+        json.put("one_id", oneIDScore.one_id)
+        json.put("one_type", oneIDScore.one_type)
+        json.put("one_score", oneIDScore.one_score)
+        Result(device_id = device_id, device_type = device_type, one_id = json.toJSONString)
+      })
+    })
+    resultOneID
+      .toDF
+      .repartition(coalesce)
+      .write.mode(SaveMode.Overwrite)
+      .option("orc.compress", "zlib")
+      .orc(resultOutPutPath)
   }
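
Editor's note: for readers skimming the hunk above, here is a compact, self-contained sketch of the two per-record transformations it introduces: MD5-normalizing raw IDs before they become OneID keys, and picking the winning OneID by id_type priority with a recency-decayed score. The md5Ptn regex, the date format standing in for sdf1, and the sample priority/weight values are assumptions; the real constants live in MobvistaConstant and Constant.

import java.security.MessageDigest
import java.text.SimpleDateFormat

object OneIDSketch {
  val md5Ptn = "^[0-9a-fA-F]{32}$"             // assumed 32-hex-char MD5 pattern
  val sdf = new SimpleDateFormat("yyyy-MM-dd") // assumed stand-in for MobvistaConstant.sdf1
  val idSet: Array[String] = Array("imei", "oaid", "gaid")             // assumed priority order
  val scoreMap: Map[String, Double] = Map("imei" -> 10.0, "oaid" -> 9.0, "gaid" -> 8.0)

  // Mirrors the hunk's normalization: keep IDs that are already MD5 hashes,
  // hash everything else (MD5Util.getMD5Str in the job).
  def normalize(id: String): String =
    if (id.matches(md5Ptn)) id
    else MessageDigest.getInstance("MD5").digest(id.getBytes("UTF-8")).map("%02x".format(_)).mkString

  case class OneIDScore(one_id: String, one_type: String, one_score: Double)

  // Recency-decayed score: base weight scaled by 30 / days-since-active, plus 0.1 per sighting.
  def score(idType: String, activeDate: String, cnt: Int, date: String): Double = {
    val days = (sdf.parse(date).getTime - sdf.parse(activeDate).getTime) / 1000 / 3600 / 24 + 1
    scoreMap(idType) * 30 / days + 0.1 * cnt
  }

  // Same selection rule as the hunk: prefer the higher-priority id_type (lower idSet index),
  // and break ties within a type on score.
  def select(cands: Seq[(String, String, String, Int)], date: String): OneIDScore =
    cands.foldLeft(OneIDScore("", "", 0)) { case (best, (id, idType, activeDate, cnt)) =>
      val s = score(idType, activeDate, cnt, date)
      val better = idSet.indexOf(idType) < idSet.indexOf(best.one_type) ||
        idSet.indexOf(best.one_type) == -1 ||
        (idSet.indexOf(idType) == idSet.indexOf(best.one_type) && s >= best.one_score)
      if (better) OneIDScore(id, idType, s) else best
    }
  // e.g. select(Seq((normalize("867530..."), "imei", "2021-12-01", 3)), "2021-12-07")
}

An ID active today (days = 1) scores 30 times its base weight and the bonus decays hyperbolically with age, while the cnt term adds a small frequency bonus; type priority still dominates both.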
@@ -157,13 +208,13 @@ class IDMappingGraphx extends CommonSparkJob with Serializable {
     platform match {
       case "ios" =>
         var idfa = row.getAs[String]("idfa")
-        idfa = if (StringUtils.isNotBlank(idfa) && idfa.matches(didPtn) && !idfa.matches(allZero)) {
+        idfa = if (StringUtils.isNotBlank(idfa) && idfa.matches(MobvistaConstant.didPtn) && !idfa.matches(MobvistaConstant.allZero)) {
           idfa
         } else {
           ""
         }
         var idfv = row.getAs[String]("idfv")
-        idfv = if (StringUtils.isNotBlank(idfv) && idfv.matches(didPtn) && !idfv.matches(allZero)) {
+        idfv = if (StringUtils.isNotBlank(idfv) && idfv.matches(MobvistaConstant.didPtn) && !idfv.matches(MobvistaConstant.allZero)) {
           idfv
         } else {
           ""
...
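
Editor's note: the didPtn / allZero checks above blank out malformed device IDs and the all-zero placeholder reported by opted-out devices. A minimal sketch of that validation, assuming UUID-shaped patterns; the real regexes live in MobvistaConstant and may differ:

// Sketch only: assumed stand-ins for MobvistaConstant.didPtn / MobvistaConstant.allZero.
object DidCleanSketch {
  val didPtn = "^[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12}$" // assumed UUID shape
  val allZero = "^[0-]+$"                                             // assumed all-zero placeholder

  // Mirrors the per-field rule above: keep a well-formed, non-zero ID, otherwise blank it.
  def clean(did: String): String =
    if (did != null && did.matches(didPtn) && !did.matches(allZero)) did else ""
}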