Commit a3f58be6 by WangJinfeng

init id_mapping

parent b1f36887
......@@ -437,6 +437,8 @@ DSP_DEVICE_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/dsp/device
ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwd/dwd_device_ids_inc_daily"
ADS_DEVICE_MID_ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/ads/ads_device_mid_id_mapping"
ADS_DEVICE_ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/ads/ads_device_id_mapping"
JAR=./DMP.jar
......
......@@ -19,8 +19,8 @@ spark-submit --class mobvista.dmp.datasource.id_mapping.DspReq \
--name "EtlDeviceIdDaily.$BUSINESS.$LOG_TIME" \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.network.timeout=720s \
--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
--conf spark.sql.shuffle.partitions=20000 \
--conf spark.default.parallelism=20000 \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 8g --executor-cores 5 --num-executors 200 \
../${JAR} -date ${LOG_TIME} -business ${BUSINESS} -output ${OUTPUT_PATH} -coalesce 2000
......
type=command
command=sh -x id_mapping.sh
\ No newline at end of file
dependencies=id_mapping_overseas_android,id_mapping_cn_android
command=echo "id_mapping job end!"
\ No newline at end of file
......@@ -2,7 +2,11 @@
source ../dmp_env.sh
LOG_TIME=$(date +%Y-%m-%d -d "-1 day $ScheduleTime")
COUNTRY=$1
PLATFORM=$2
LOG_TIME=$(date +%Y%m%d -d "-1 day $ScheduleTime")
date_path=$(date +'%Y/%m/%d' -d "-1 day $ScheduleTime")
......@@ -10,28 +14,22 @@ ADN_REQUEST_INPUT_PATH=${ID_MAPPING}/${date_path}/adn_request
DSP_INPUT_PATH=${ID_MAPPING}/${date_path}/dsp_req
check_await "${ADN_REQUEST_INPUT_PATH}/_SUCCESS"
check_await "${DSP_INPUT_PATH}/_SUCCESS"
OUTPUT_PATH=${ADS_DEVICE_MID_ID_MAPPING}/${date_path}
RESULT_OUTPUT_PATH=${ADS_DEVICE_ID_MAPPING}/${date_path}
country="US"
platform="ios"
# check_await "${ADN_REQUEST_INPUT_PATH}/$PLATFORM/_SUCCESS"
# check_await "${DSP_INPUT_PATH}/$PLATFORM/_SUCCESS"
OUTPUT_PATH=${ADS_DEVICE_ID_MAPPING}/${date_path}/${COUNTRY}/${PLATFORM}
spark-submit --class mobvista.dmp.datasource.id_mapping.IDMappingGraphx \
--name "IDMappingGraphx.${LOG_TIME}.${country}.${platform}" \
--name "IDMappingGraphx.${LOG_TIME}.${COUNTRY}.${PLATFORM}" \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.network.timeout=720s \
--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 8g --executor-cores 5 --num-executors 100 \
../${JAR} -date ${LOG_TIME} -country ${country} -platform ${platform} -output ${OUTPUT_PATH} -result_output ${RESULT_OUTPUT_PATH} -coalesce 500
--master yarn --deploy-mode cluster --executor-memory 10g --driver-memory 6g --executor-cores 5 --num-executors 200 \
../${JAR} -date ${LOG_TIME} -country ${COUNTRY} -platform ${PLATFORM} \
-output ${OUTPUT_PATH}/mid -fre_output ${OUTPUT_PATH}/frequency -result_output ${OUTPUT_PATH}/result -coalesce 1000
if [ $? -ne 0 ]; then
exit 255
fi
\ No newline at end of file
fi
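
Note: id_mapping.sh now takes the country and platform as positional arguments (COUNTRY=$1, PLATFORM=$2) in place of the hard-coded country="US" / platform="ios", and the single -output/-result_output pair becomes three sub-paths (mid, frequency, result) under one dated, per-country/per-platform root; executors double to 200 while executor and driver memory drop to 10g/6g. The Azkaban job definitions below show the intended invocations.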
type=command
dependencies=id_mapping_cn_ios
command=sh -x id_mapping.sh 'cn' 'android'
\ No newline at end of file
type=command
command=sh -x id_mapping.sh 'cn' 'ios'
\ No newline at end of file
type=command
dependencies=id_mapping_overseas_ios
command=sh -x id_mapping.sh 'overseas' 'android'
\ No newline at end of file
type=command
command=sh -x id_mapping.sh 'overseas' 'ios'
\ No newline at end of file
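
Note: read together, these four job files plus the end job earlier in the diff form two serial chains, id_mapping_cn_ios → id_mapping_cn_android and id_mapping_overseas_ios → id_mapping_overseas_android, which the final job (dependencies=id_mapping_overseas_android,id_mapping_cn_android) joins before echoing "id_mapping job end!".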
......@@ -32,7 +32,6 @@ unmount_output_path="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/dm_us
spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
--name "DeviceInfoJob.wangjf.${date}" \
--conf spark.sql.broadcastTimeout=1200 \
--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
--conf spark.kryoserializer.buffer.max=512m \
......@@ -40,7 +39,7 @@ spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
--conf spark.sql.files.maxPartitionBytes=536870912 \
--conf spark.sql.adaptive.enabled=true \
--conf spark.sql.adaptive.advisoryPartitionSizeInBytes=536870912 \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 100 \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 150 \
../${JAR} \
-date ${date} -output ${output_path} -coalesce 3000
......
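
Note: dropping the fixed spark.sql.shuffle.partitions here fits the adaptive-execution settings this job already carries: with spark.sql.adaptive.enabled=true, Spark sizes post-shuffle partitions toward spark.sql.adaptive.advisoryPartitionSizeInBytes instead of honoring a static count. A minimal sketch of the equivalent in-code configuration (the builder itself is illustrative):

```scala
import org.apache.spark.sql.SparkSession

// With AQE on, post-shuffle partition counts follow the 512 MB advisory
// size below rather than a fixed spark.sql.shuffle.partitions value.
val spark = SparkSession.builder()
  .appName("DeviceInfoJob")
  .config("spark.sql.adaptive.enabled", "true")
  .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "536870912")
  .config("spark.sql.files.maxPartitionBytes", "536870912")
  .getOrCreate()
```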
......@@ -59,7 +59,7 @@ object Constant {
"""
|SELECT b.device_id, UPPER(country) country, CAST(b.offer_id AS string) offer_id, COALESCE(a.id, b.event_name) id, COALESCE(a.event_name, b.event_name) event_name, COALESCE(a.event_type,'') event_type FROM
| (SELECT devid device_id, MAX(country) country, event_name, uuid offer_id FROM dwh.ods_3s_trackingcsv_event_info
| WHERE yyyy = '@year' and mm = '@month' and dd = '@day' AND devid IS NOT NULL AND devid <> '' GROUP BY devid, event_name, uuid) b
| WHERE yyyymmdd = '@date' AND devid IS NOT NULL AND devid <> '' GROUP BY devid, event_name, uuid) b
| LEFT JOIN
| (SELECT CAST(id AS string) id, event_name, event_type, offer_id FROM dwh.ods_3s_trackingcsv_event_define WHERE yyyymmdd = '@date') a
| ON a.offer_id = b.offer_id
......
......@@ -75,13 +75,7 @@ class TrackingEventDaily extends CommonSparkJob with java.io.Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)
val year = date.substring(0, 4)
val month = date.substring(4, 6)
val day = date.substring(6, 8)
var sql = Constant.tracking_event_sql.replace("@year", year)
.replace("@month", month)
.replace("@day", day)
var sql = Constant.tracking_event_sql.replace("@date", date)
spark.sql(sql)
.filter(r => {
......
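
Note: this pairs with the SQL change above: a single yyyymmdd partition column replaces the three-way yyyy/mm/dd predicate, so the substring slicing and chained replaces collapse into one substitution; with the reassignment chain gone, the remaining var sql could also become a val. A worked example, assuming the yyyyMMdd date format the old slicing implies:

```scala
// With date = "20240115", the template expands to
//   ... WHERE yyyymmdd = '20240115' AND devid IS NOT NULL ...
val sql = Constant.tracking_event_sql.replace("@date", "20240115")
```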
......@@ -115,7 +115,7 @@ object Constant {
StructField("xwho", StringType),
StructField("user_id", StringType),
StructField("bkupid", StringType),
StructField("cnt", IntegerType)
StructField("cnt", LongType)
))
val androidCNIDSet = Array("imei", "oaid", "gaid", "sysid", "xwho", "user_id", "android_pkg", "bmosv_upt", "bmosv_ipua_pkg", "bkupid")
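
Note: the cnt field widens because count(1) in Spark SQL returns a bigint, which arrives in Scala as a Long; encoding such rows against a schema that declares IntegerType fails at runtime. A minimal reproduction, assuming an active SparkSession spark (field list shortened):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// count(1) produces LongType values, so the declared schema must match.
val schema = StructType(Seq(
  StructField("bkupid", StringType),
  StructField("cnt", LongType) // fails at encoding time if IntegerType
))
val df = spark.createDataFrame(
  spark.sparkContext.parallelize(Seq(Row("abc", 3L))),
  schema
)
```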
......@@ -223,7 +223,7 @@ object Constant {
val ios_id_mapping_sql: String =
"""
|SELECT idfa, idfv, pkg_name, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt, count(1) cnt
| FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '@date'
| FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '@date' @filter_country
| GROUP BY idfa, idfv, pkg_name, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
|""".stripMargin
......@@ -231,7 +231,7 @@ object Constant {
"""
|SELECT imei, android_id, pkg_name, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt, count(1) cnt
| FROM dwd.dwd_device_android_ids_inc_daily WHERE dt = '@date' @filter_country
| GROUP BY imei, android_id, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
| GROUP BY imei, android_id, pkg_name, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
|""".stripMargin
val old_id_mapping_sql: String =
......
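
Note: adding pkg_name to the GROUP BY is a correctness fix, not tuning: the column appears in the SELECT list, and Spark rejects a non-aggregated, non-grouped column at analysis time. A minimal reproduction, assuming an active SparkSession spark:

```scala
import spark.implicits._

Seq(("imei1", "pkg1")).toDF("imei", "pkg_name").createOrReplaceTempView("t")
// Old shape fails analysis: pkg_name is neither grouped nor aggregated.
// spark.sql("SELECT imei, pkg_name, count(1) FROM t GROUP BY imei")
//   ==> org.apache.spark.sql.AnalysisException
spark.sql("SELECT imei, pkg_name, count(1) AS cnt FROM t GROUP BY imei, pkg_name").show()
```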
......@@ -22,13 +22,17 @@ class DspReq extends EtlDeviceIdDaily {
// ODS
val hour = i match {
case 0 =>
" AND hh BETWEEN '00' AND '05'"
" AND hh BETWEEN '00' AND '03'"
case 1 =>
" AND hh BETWEEN '06' AND '11'"
" AND hh BETWEEN '04' AND '07'"
case 2 =>
" AND hh BETWEEN '12' AND '17'"
" AND hh BETWEEN '08' AND '11'"
case 3 =>
" AND hh BETWEEN '18' AND '23'"
" AND hh BETWEEN '12' AND '15'"
case 4 =>
" AND hh BETWEEN '16' AND '19'"
case 5 =>
" AND hh BETWEEN '20' AND '23'"
case _ =>
""
}
......
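
Note: the day is now split into six 4-hour windows instead of four 6-hour ones, shrinking each batch the dsp_req pass reads (the loop bound changes to 0 until 6 in the next hunk to match). Since the windows follow a regular stride, an equivalent derivation would be (helper name hypothetical):

```scala
// hourFilter(2) == " AND hh BETWEEN '08' AND '11'"
def hourFilter(i: Int): String =
  if (i >= 0 && i <= 5) {
    val start = i * 4
    f" AND hh BETWEEN '$start%02d' AND '${start + 3}%02d'"
  } else ""
```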
......@@ -40,10 +40,9 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
try {
if ("dsp_req".equalsIgnoreCase(business)) {
for (i <- 0 until 4) {
for (i <- 0 until 6) {
val df = processData(date, i, spark)
.repartition(5000)
.persist(StorageLevel.MEMORY_AND_DISK_SER)
df.persist(StorageLevel.MEMORY_AND_DISK_SER)
val iosTab = df.filter(plf => {
"ios".equals(plf._1)
......@@ -53,7 +52,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/ios/${i}"), true)
spark.createDataFrame(iosTab, iosSchema)
.coalesce(coalesce)
.repartition(coalesce)
.write.mode(SaveMode.Overwrite)
.option("orc.compress", "zlib")
.orc(output + s"/ios/${i}")
......@@ -67,7 +66,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/android/${i}"), true)
spark.createDataFrame(adrTab, adrSchema)
.coalesce(coalesce)
.repartition(coalesce)
.write.mode(SaveMode.Overwrite)
.option("orc.compress", "zlib")
.orc(output + s"/android/${i}")
......@@ -81,7 +80,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/other/${i}"), true)
spark.createDataFrame(otherTab, otherSchema)
.coalesce(coalesce)
.repartition(coalesce)
.write.mode(SaveMode.Overwrite)
.option("orc.compress", "zlib")
.orc(output + s"/other/${i}")
......@@ -90,8 +89,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
}
} else {
val df = processData(date, 0, spark)
.repartition(5000)
.persist(StorageLevel.MEMORY_AND_DISK_SER)
df.persist(StorageLevel.MEMORY_AND_DISK_SER)
val iosTab = df.filter(plf => {
"ios".equals(plf._1)
......
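
Note: two related shifts here: the eager repartition(5000) before persist is dropped (the cache keeps the source partitioning), and the writes switch from coalesce to repartition. coalesce(n) folds partitions together without a shuffle, which also caps the parallelism of the whole stage feeding the write; repartition(n) inserts a shuffle and rebalances, keeping upstream work at full parallelism at the cost of that shuffle. A minimal illustration, assuming an active SparkSession spark:

```scala
val df = spark.range(0L, 1000000L, 1L, 5000).toDF("id")
df.coalesce(1000).rdd.getNumPartitions    // 1000, no shuffle inserted
df.repartition(1000).rdd.getNumPartitions // 1000, full shuffle, even sizes
```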
......@@ -57,7 +57,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
.config("spark.sql.orc.filterPushdown", "true")
.config("spark.io.compression.codec", "lz4")
.config("spark.io.compression.lz4.blockSize", "64k")
.config("spark.sql.autoBroadcastJoinThreshold", "314572800")
.config("spark.sql.autoBroadcastJoinThreshold", "-1")
.config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.enableHiveSupport()
......
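
Note: 314572800 bytes (300 MB) allowed Spark to auto-broadcast fairly large join sides; -1 disables automatic broadcast joins entirely, so joins fall back to sort-merge unless hinted. If selective broadcasting is still wanted, it becomes opt-in (DataFrame names illustrative):

```scala
import org.apache.spark.sql.functions.broadcast

// With autoBroadcastJoinThreshold = -1, only an explicit hint broadcasts.
val joined = deviceDf.join(broadcast(smallDimDf), Seq("device_id"), "left")
```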