Commit a3f58be6 by WangJinfeng

init id_mapping

parent b1f36887
@@ -437,6 +437,8 @@ DSP_DEVICE_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/dsp/device
 ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwd/dwd_device_ids_inc_daily"
+ADS_DEVICE_MID_ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/ads/ads_device_mid_id_mapping"
 ADS_DEVICE_ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/ads/ads_device_id_mapping"
 JAR=./DMP.jar
...
@@ -19,8 +19,8 @@ spark-submit --class mobvista.dmp.datasource.id_mapping.DspReq \
 --name "EtlDeviceIdDaily.$BUSINESS.$LOG_TIME" \
 --conf spark.yarn.executor.memoryOverhead=2048 \
 --conf spark.network.timeout=720s \
---conf spark.sql.shuffle.partitions=10000 \
---conf spark.default.parallelism=10000 \
+--conf spark.sql.shuffle.partitions=20000 \
+--conf spark.default.parallelism=20000 \
 --master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 8g --executor-cores 5 --num-executors 200 \
 ../${JAR} -date ${LOG_TIME} -business ${BUSINESS} -output ${OUTPUT_PATH} -coalesce 2000
...
 type=command
-command=sh -x id_mapping.sh
\ No newline at end of file
+dependencies=id_mapping_overseas_android,id_mapping_cn_android
+command=echo "id_mapping job end!"
\ No newline at end of file
@@ -2,7 +2,11 @@
 source ../dmp_env.sh
-LOG_TIME=$(date +%Y-%m-%d -d "-1 day $ScheduleTime")
+COUNTRY=$1
+PLATFORM=$2
+LOG_TIME=$(date +%Y%m%d -d "-1 day $ScheduleTime")
 date_path=$(date +'%Y/%m/%d' -d "-1 day $ScheduleTime")
@@ -10,28 +14,22 @@ ADN_REQUEST_INPUT_PATH=${ID_MAPPING}/${date_path}/adn_request
 DSP_INPUT_PATH=${ID_MAPPING}/${date_path}/dsp_req
-check_await "${ADN_REQUEST_INPUT_PATH}/_SUCCESS"
-check_await "${DSP_INPUT_PATH}/_SUCCESS"
-OUTPUT_PATH=${ADS_DEVICE_MID_ID_MAPPING}/${date_path}
-RESULT_OUTPUT_PATH=${ADS_DEVICE_ID_MAPPING}/${date_path}
-country="US"
-platform="ios"
+# check_await "${ADN_REQUEST_INPUT_PATH}/$PLATFORM/_SUCCESS"
+# check_await "${DSP_INPUT_PATH}/$PLATFORM/_SUCCESS"
+OUTPUT_PATH=${ADS_DEVICE_ID_MAPPING}/${date_path}/${COUNTRY}/${PLATFORM}
 spark-submit --class mobvista.dmp.datasource.id_mapping.IDMappingGraphx \
---name "IDMappingGraphx.${LOG_TIME}.${country}.${platform}" \
+--name "IDMappingGraphx.${LOG_TIME}.${COUNTRY}.${PLATFORM}" \
 --conf spark.yarn.executor.memoryOverhead=2048 \
 --conf spark.network.timeout=720s \
 --conf spark.sql.shuffle.partitions=10000 \
 --conf spark.default.parallelism=10000 \
---conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \
---master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 8g --executor-cores 5 --num-executors 100 \
-../${JAR} -date ${LOG_TIME} -country ${country} -platform ${platform} -output ${OUTPUT_PATH} -result_output ${RESULT_OUTPUT_PATH} -coalesce 500
+--master yarn --deploy-mode cluster --executor-memory 10g --driver-memory 6g --executor-cores 5 --num-executors 200 \
+../${JAR} -date ${LOG_TIME} -country ${COUNTRY} -platform ${PLATFORM} \
+-output ${OUTPUT_PATH}/mid -fre_output ${OUTPUT_PATH}/frequency -result_output ${OUTPUT_PATH}/result -coalesce 1000
 if [ $? -ne 0 ]; then
 exit 255
 fi
\ No newline at end of file
type=command
dependencies=id_mapping_cn_ios
command=sh -x id_mapping.sh 'cn' 'android'
\ No newline at end of file
type=command
command=sh -x id_mapping.sh 'cn' 'ios'
\ No newline at end of file
type=command
dependencies=id_mapping_overseas_ios
command=sh -x id_mapping.sh 'overseas' 'android'
\ No newline at end of file
type=command
command=sh -x id_mapping.sh 'overseas' 'ios'
\ No newline at end of file
@@ -32,7 +32,6 @@ unmount_output_path="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/dm_us
 spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
 --name "DeviceInfoJob.wangjf.${date}" \
---conf spark.sql.broadcastTimeout=1200 \
 --conf spark.sql.shuffle.partitions=10000 \
 --conf spark.default.parallelism=10000 \
 --conf spark.kryoserializer.buffer.max=512m \
@@ -40,7 +39,7 @@ spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
 --conf spark.sql.files.maxPartitionBytes=536870912 \
 --conf spark.sql.adaptive.enabled=true \
 --conf spark.sql.adaptive.advisoryPartitionSizeInBytes=536870912 \
---master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 100 \
+--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 150 \
 ../${JAR} \
 -date ${date} -output ${output_path} -coalesce 3000
...
@@ -59,7 +59,7 @@ object Constant {
     """
       |SELECT b.device_id, UPPER(country) country, CAST(b.offer_id AS string) offer_id, COALESCE(a.id, b.event_name) id, COALESCE(a.event_name, b.event_name) event_name, COALESCE(a.event_type,'') event_type FROM
       | (SELECT devid device_id, MAX(country) country, event_name, uuid offer_id FROM dwh.ods_3s_trackingcsv_event_info
-      | WHERE yyyy = '@year' and mm = '@month' and dd = '@day' AND devid IS NOT NULL AND devid <> '' GROUP BY devid, event_name, uuid) b
+      | WHERE yyyymmdd = '@date' AND devid IS NOT NULL AND devid <> '' GROUP BY devid, event_name, uuid) b
       | LEFT JOIN
       | (SELECT CAST(id AS string) id, event_name, event_type, offer_id FROM dwh.ods_3s_trackingcsv_event_define WHERE yyyymmdd = '@date') a
       | ON a.offer_id = b.offer_id
...
@@ -75,13 +75,7 @@ class TrackingEventDaily extends CommonSparkJob with java.io.Serializable {
     FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)
-    val year = date.substring(0, 4)
-    val month = date.substring(4, 6)
-    val day = date.substring(6, 8)
-    var sql = Constant.tracking_event_sql.replace("@year", year)
-      .replace("@month", month)
-      .replace("@day", day)
+    var sql = Constant.tracking_event_sql.replace("@date", date)
     spark.sql(sql)
       .filter(r => {
...
@@ -115,7 +115,7 @@ object Constant {
     StructField("xwho", StringType),
     StructField("user_id", StringType),
     StructField("bkupid", StringType),
-    StructField("cnt", IntegerType)
+    StructField("cnt", LongType)
   ))
   val androidCNIDSet = Array("imei", "oaid", "gaid", "sysid", "xwho", "user_id", "android_pkg", "bmosv_upt", "bmosv_ipua_pkg", "bkupid")
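Note on the cnt field above: count(1) in Spark SQL is typed as BIGINT, so the matching StructField has to be LongType rather than IntegerType. A minimal check, assuming a live SparkSession (the dt value is a placeholder):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.LongType

    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
    // count(1) comes back as a BIGINT column, so the schema field for "cnt" must be LongType.
    val cnt = spark.sql("SELECT count(1) AS cnt FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '20211201'")
    assert(cnt.schema("cnt").dataType == LongType)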
@@ -223,7 +223,7 @@ object Constant {
   val ios_id_mapping_sql: String =
     """
       |SELECT idfa, idfv, pkg_name, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt, count(1) cnt
-      | FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '@date'
+      | FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '@date' @filter_country
       | GROUP BY idfa, idfv, pkg_name, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
       |""".stripMargin
@@ -231,7 +231,7 @@ object Constant {
     """
       |SELECT imei, android_id, pkg_name, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt, count(1) cnt
       | FROM dwd.dwd_device_android_ids_inc_daily WHERE dt = '@date' @filter_country
-      | GROUP BY imei, android_id, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
+      | GROUP BY imei, android_id, pkg_name, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
       |""".stripMargin
   val old_id_mapping_sql: String =
...
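The @date and @filter_country tokens in the SQL above are plain string placeholders. A minimal sketch of how they might be substituted before execution; the variable names and the country predicate are assumptions, not part of this commit:

    // Hypothetical substitution of the template placeholders (names assumed, not from this diff).
    val date = "20211201"                       // e.g. taken from the -date argument
    val country = "cn"                          // e.g. taken from the -country argument
    val filterCountry =
      if ("cn".equalsIgnoreCase(country)) "AND UPPER(country) = 'CN'"
      else "AND UPPER(country) <> 'CN'"
    val sql = Constant.ios_id_mapping_sql
      .replace("@date", date)
      .replace("@filter_country", filterCountry)
    // spark.sql(sql) then reads only the requested country slice.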
@@ -22,13 +22,17 @@ class DspReq extends EtlDeviceIdDaily {
     // ODS
     val hour = i match {
       case 0 =>
-        " AND hh BETWEEN '00' AND '05'"
+        " AND hh BETWEEN '00' AND '03'"
       case 1 =>
-        " AND hh BETWEEN '06' AND '11'"
+        " AND hh BETWEEN '04' AND '07'"
       case 2 =>
-        " AND hh BETWEEN '12' AND '17'"
+        " AND hh BETWEEN '08' AND '11'"
       case 3 =>
-        " AND hh BETWEEN '18' AND '23'"
+        " AND hh BETWEEN '12' AND '15'"
+      case 4 =>
+        " AND hh BETWEEN '16' AND '19'"
+      case 5 =>
+        " AND hh BETWEEN '20' AND '23'"
       case _ =>
         ""
     }
...
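The six case branches above carve the day into consecutive four-hour windows. An equivalent, index-derived form for reference (hypothetical helper, not part of this commit):

    // Slice i of 6 covers hours [4*i, 4*i + 3].
    def hourFilter(i: Int): String =
      if (i >= 0 && i <= 5) f" AND hh BETWEEN '${i * 4}%02d' AND '${i * 4 + 3}%02d'"
      else ""
    // hourFilter(0) == " AND hh BETWEEN '00' AND '03'"; hourFilter(5) == " AND hh BETWEEN '20' AND '23'"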
@@ -40,10 +40,9 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
     try {
       if ("dsp_req".equalsIgnoreCase(business)) {
-        for (i <- 0 until 4) {
+        for (i <- 0 until 6) {
           val df = processData(date, i, spark)
-            .repartition(5000)
-            .persist(StorageLevel.MEMORY_AND_DISK_SER)
+          df.persist(StorageLevel.MEMORY_AND_DISK_SER)
           val iosTab = df.filter(plf => {
             "ios".equals(plf._1)
@@ -53,7 +52,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
           FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/ios/${i}"), true)
           spark.createDataFrame(iosTab, iosSchema)
-            .coalesce(coalesce)
+            .repartition(coalesce)
             .write.mode(SaveMode.Overwrite)
             .option("orc.compress", "zlib")
             .orc(output + s"/ios/${i}")
@@ -67,7 +66,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
           FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/android/${i}"), true)
           spark.createDataFrame(adrTab, adrSchema)
-            .coalesce(coalesce)
+            .repartition(coalesce)
             .write.mode(SaveMode.Overwrite)
             .option("orc.compress", "zlib")
             .orc(output + s"/android/${i}")
@@ -81,7 +80,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
           FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/other/${i}"), true)
           spark.createDataFrame(otherTab, otherSchema)
-            .coalesce(coalesce)
+            .repartition(coalesce)
             .write.mode(SaveMode.Overwrite)
             .option("orc.compress", "zlib")
             .orc(output + s"/other/${i}")
@@ -90,8 +89,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
         }
       } else {
         val df = processData(date, 0, spark)
-          .repartition(5000)
-          .persist(StorageLevel.MEMORY_AND_DISK_SER)
+        df.persist(StorageLevel.MEMORY_AND_DISK_SER)
         val iosTab = df.filter(plf => {
           "ios".equals(plf._1)
...
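On the write path above: coalesce(n) narrows partitions without a shuffle and can also shrink the parallelism of the upstream stage, while repartition(n) inserts a full shuffle and leaves the upstream computation at its original width. A minimal illustration (df stands for any wide DataFrame):

    val narrowed   = df.coalesce(1000)      // no shuffle; upstream tasks are merged down to 1000
    val reshuffled = df.repartition(1000)   // full shuffle; upstream stays wide, output has 1000 partitions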
@@ -57,7 +57,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
       .config("spark.sql.orc.filterPushdown", "true")
       .config("spark.io.compression.codec", "lz4")
       .config("spark.io.compression.lz4.blockSize", "64k")
-      .config("spark.sql.autoBroadcastJoinThreshold", "314572800")
+      .config("spark.sql.autoBroadcastJoinThreshold", "-1")
       .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
       .enableHiveSupport()
...
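Setting spark.sql.autoBroadcastJoinThreshold to -1, as above, turns automatic broadcast joins off entirely; an explicit hint can still broadcast a known-small side. A minimal sketch (the DataFrame names are placeholders):

    import org.apache.spark.sql.functions.broadcast

    // Auto-broadcast is disabled, but a specific join can still be broadcast when the small side is known.
    val joined = deviceDf.join(broadcast(dimDf), Seq("device_id"), "left")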