Commit a3f58be6 by WangJinfeng

init id_mapping

parent b1f36887
......@@ -437,6 +437,8 @@ DSP_DEVICE_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/dsp/device
ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwd/dwd_device_ids_inc_daily"
ADS_DEVICE_MID_ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/ads/ads_device_mid_id_mapping"
ADS_DEVICE_ID_MAPPING="s3://mob-emr-test/dataplatform/DataWareHouse/data/ads/ads_device_id_mapping"
JAR=./DMP.jar
......
......@@ -19,8 +19,8 @@ spark-submit --class mobvista.dmp.datasource.id_mapping.DspReq \
--name "EtlDeviceIdDaily.$BUSINESS.$LOG_TIME" \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.network.timeout=720s \
--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
--conf spark.sql.shuffle.partitions=20000 \
--conf spark.default.parallelism=20000 \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 8g --executor-cores 5 --num-executors 200 \
../${JAR} -date ${LOG_TIME} -business ${BUSINESS} -output ${OUTPUT_PATH} -coalesce 2000
......
type=command
command=sh -x id_mapping.sh
\ No newline at end of file
dependencies=id_mapping_overseas_android,id_mapping_cn_android
command=echo "id_mapping job end!"
\ No newline at end of file
......@@ -2,7 +2,11 @@
source ../dmp_env.sh
LOG_TIME=$(date +%Y-%m-%d -d "-1 day $ScheduleTime")
COUNTRY=$1
PLATFORM=$2
LOG_TIME=$(date +%Y%m%d -d "-1 day $ScheduleTime")
date_path=$(date +'%Y/%m/%d' -d "-1 day $ScheduleTime")
......@@ -10,28 +14,22 @@ ADN_REQUEST_INPUT_PATH=${ID_MAPPING}/${date_path}/adn_request
DSP_INPUT_PATH=${ID_MAPPING}/${date_path}/dsp_req
check_await "${ADN_REQUEST_INPUT_PATH}/_SUCCESS"
check_await "${DSP_INPUT_PATH}/_SUCCESS"
OUTPUT_PATH=${ADS_DEVICE_MID_ID_MAPPING}/${date_path}
RESULT_OUTPUT_PATH=${ADS_DEVICE_ID_MAPPING}/${date_path}
country="US"
platform="ios"
# check_await "${ADN_REQUEST_INPUT_PATH}/$PLATFORM/_SUCCESS"
# check_await "${DSP_INPUT_PATH}/$PLATFORM/_SUCCESS"
OUTPUT_PATH=${ADS_DEVICE_ID_MAPPING}/${date_path}/${COUNTRY}/${PLATFORM}
spark-submit --class mobvista.dmp.datasource.id_mapping.IDMappingGraphx \
--name "IDMappingGraphx.${LOG_TIME}.${country}.${platform}" \
--name "IDMappingGraphx.${LOG_TIME}.${COUNTRY}.${PLATFORM}" \
--conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.network.timeout=720s \
--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 8g --executor-cores 5 --num-executors 100 \
../${JAR} -date ${LOG_TIME} -country ${country} -platform ${platform} -output ${OUTPUT_PATH} -result_output ${RESULT_OUTPUT_PATH} -coalesce 500
--master yarn --deploy-mode cluster --executor-memory 10g --driver-memory 6g --executor-cores 5 --num-executors 200 \
../${JAR} -date ${LOG_TIME} -country ${COUNTRY} -platform ${PLATFORM} \
-output ${OUTPUT_PATH}/mid -fre_output ${OUTPUT_PATH}/frequency -result_output ${OUTPUT_PATH}/result -coalesce 1000
if [ $? -ne 0 ]; then
exit 255
fi
\ No newline at end of file
fi
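
Note: id_mapping.sh now takes the country and platform as positional arguments (COUNTRY=$1, PLATFORM=$2) in place of the hard-coded country="US" / platform="ios", and the single -output/-result_output pair becomes three sub-paths (mid, frequency, result) under one dated, per-country/per-platform root; executors double to 200 while executor and driver memory drop to 10g/6g. The Azkaban job definitions below show the intended invocations.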
type=command
dependencies=id_mapping_cn_ios
command=sh -x id_mapping.sh 'cn' 'android'
\ No newline at end of file
type=command
command=sh -x id_mapping.sh 'cn' 'ios'
\ No newline at end of file
type=command
dependencies=id_mapping_overseas_ios
command=sh -x id_mapping.sh 'overseas' 'android'
\ No newline at end of file
type=command
command=sh -x id_mapping.sh 'overseas' 'ios'
\ No newline at end of file
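
Note: read together, these four job files plus the end job earlier in the diff form two serial chains, id_mapping_cn_ios → id_mapping_cn_android and id_mapping_overseas_ios → id_mapping_overseas_android, which the final job (dependencies=id_mapping_overseas_android,id_mapping_cn_android) joins before echoing "id_mapping job end!".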
......@@ -32,7 +32,6 @@ unmount_output_path="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/dm_us
spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
--name "DeviceInfoJob.wangjf.${date}" \
--conf spark.sql.broadcastTimeout=1200 \
--conf spark.sql.shuffle.partitions=10000 \
--conf spark.default.parallelism=10000 \
--conf spark.kryoserializer.buffer.max=512m \
......@@ -40,7 +39,7 @@ spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
--conf spark.sql.files.maxPartitionBytes=536870912 \
--conf spark.sql.adaptive.enabled=true \
--conf spark.sql.adaptive.advisoryPartitionSizeInBytes=536870912 \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 100 \
--master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 150 \
../${JAR} \
-date ${date} -output ${output_path} -coalesce 3000
......
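
Note: dropping the fixed spark.sql.shuffle.partitions here fits the adaptive-execution settings this job already carries: with spark.sql.adaptive.enabled=true, Spark sizes post-shuffle partitions toward spark.sql.adaptive.advisoryPartitionSizeInBytes instead of honoring a static count. A minimal sketch of the equivalent in-code configuration (the builder itself is illustrative):

```scala
import org.apache.spark.sql.SparkSession

// With AQE on, post-shuffle partition counts follow the 512 MB advisory
// size below rather than a fixed spark.sql.shuffle.partitions value.
val spark = SparkSession.builder()
  .appName("DeviceInfoJob")
  .config("spark.sql.adaptive.enabled", "true")
  .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "536870912")
  .config("spark.sql.files.maxPartitionBytes", "536870912")
  .getOrCreate()
```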
......@@ -59,7 +59,7 @@ object Constant {
"""
|SELECT b.device_id, UPPER(country) country, CAST(b.offer_id AS string) offer_id, COALESCE(a.id, b.event_name) id, COALESCE(a.event_name, b.event_name) event_name, COALESCE(a.event_type,'') event_type FROM
| (SELECT devid device_id, MAX(country) country, event_name, uuid offer_id FROM dwh.ods_3s_trackingcsv_event_info
| WHERE yyyy = '@year' and mm = '@month' and dd = '@day' AND devid IS NOT NULL AND devid <> '' GROUP BY devid, event_name, uuid) b
| WHERE yyyymmdd = '@date' AND devid IS NOT NULL AND devid <> '' GROUP BY devid, event_name, uuid) b
| LEFT JOIN
| (SELECT CAST(id AS string) id, event_name, event_type, offer_id FROM dwh.ods_3s_trackingcsv_event_define WHERE yyyymmdd = '@date') a
| ON a.offer_id = b.offer_id
......
......@@ -75,13 +75,7 @@ class TrackingEventDaily extends CommonSparkJob with java.io.Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)
val year = date.substring(0, 4)
val month = date.substring(4, 6)
val day = date.substring(6, 8)
var sql = Constant.tracking_event_sql.replace("@year", year)
.replace("@month", month)
.replace("@day", day)
var sql = Constant.tracking_event_sql.replace("@date", date)
spark.sql(sql)
.filter(r => {
......
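
Note: this pairs with the SQL change above: a single yyyymmdd partition column replaces the three-way yyyy/mm/dd predicate, so the substring slicing and chained replaces collapse into one substitution; with the reassignment chain gone, the remaining var sql could also become a val. A worked example, assuming the yyyyMMdd date format the old slicing implies:

```scala
// With date = "20240115", the template expands to
//   ... WHERE yyyymmdd = '20240115' AND devid IS NOT NULL ...
val sql = Constant.tracking_event_sql.replace("@date", "20240115")
```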
......@@ -115,7 +115,7 @@ object Constant {
StructField("xwho", StringType),
StructField("user_id", StringType),
StructField("bkupid", StringType),
StructField("cnt", IntegerType)
StructField("cnt", LongType)
))
val androidCNIDSet = Array("imei", "oaid", "gaid", "sysid", "xwho", "user_id", "android_pkg", "bmosv_upt", "bmosv_ipua_pkg", "bkupid")
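
Note: the cnt field widens because count(1) in Spark SQL returns a bigint, which arrives in Scala as a Long; encoding such rows against a schema that declares IntegerType fails at runtime. A minimal reproduction, assuming an active SparkSession spark (field list shortened):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// count(1) produces LongType values, so the declared schema must match.
val schema = StructType(Seq(
  StructField("bkupid", StringType),
  StructField("cnt", LongType) // fails at encoding time if IntegerType
))
val df = spark.createDataFrame(
  spark.sparkContext.parallelize(Seq(Row("abc", 3L))),
  schema
)
```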
......@@ -223,7 +223,7 @@ object Constant {
val ios_id_mapping_sql: String =
"""
|SELECT idfa, idfv, pkg_name, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt, count(1) cnt
| FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '@date'
| FROM dwd.dwd_device_ios_ids_inc_daily WHERE dt = '@date' @filter_country
| GROUP BY idfa, idfv, pkg_name, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
|""".stripMargin
......@@ -231,7 +231,7 @@ object Constant {
"""
|SELECT imei, android_id, pkg_name, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt, count(1) cnt
| FROM dwd.dwd_device_android_ids_inc_daily WHERE dt = '@date' @filter_country
| GROUP BY imei, android_id, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
| GROUP BY imei, android_id, pkg_name, oaid, gaid, sysid, bkupid, xwho, user_id, country, ip, ua, brand, model, os_version, osv_upt, upt
|""".stripMargin
val old_id_mapping_sql: String =
......
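
Note: adding pkg_name to the GROUP BY is a correctness fix, not tuning: the column appears in the SELECT list, and Spark rejects a non-aggregated, non-grouped column at analysis time. A minimal reproduction, assuming an active SparkSession spark:

```scala
import spark.implicits._

Seq(("imei1", "pkg1")).toDF("imei", "pkg_name").createOrReplaceTempView("t")
// Old shape fails analysis: pkg_name is neither grouped nor aggregated.
// spark.sql("SELECT imei, pkg_name, count(1) FROM t GROUP BY imei")
//   ==> org.apache.spark.sql.AnalysisException
spark.sql("SELECT imei, pkg_name, count(1) AS cnt FROM t GROUP BY imei, pkg_name").show()
```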
......@@ -22,13 +22,17 @@ class DspReq extends EtlDeviceIdDaily {
// ODS
val hour = i match {
case 0 =>
" AND hh BETWEEN '00' AND '05'"
" AND hh BETWEEN '00' AND '03'"
case 1 =>
" AND hh BETWEEN '06' AND '11'"
" AND hh BETWEEN '04' AND '07'"
case 2 =>
" AND hh BETWEEN '12' AND '17'"
" AND hh BETWEEN '08' AND '11'"
case 3 =>
" AND hh BETWEEN '18' AND '23'"
" AND hh BETWEEN '12' AND '15'"
case 4 =>
" AND hh BETWEEN '16' AND '19'"
case 5 =>
" AND hh BETWEEN '20' AND '23'"
case _ =>
""
}
......
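
Note: the day is now split into six 4-hour windows instead of four 6-hour ones, shrinking each batch the dsp_req pass reads (the loop bound changes to 0 until 6 in the next hunk to match). Since the windows follow a regular stride, an equivalent derivation would be (helper name hypothetical):

```scala
// hourFilter(2) == " AND hh BETWEEN '08' AND '11'"
def hourFilter(i: Int): String =
  if (i >= 0 && i <= 5) {
    val start = i * 4
    f" AND hh BETWEEN '$start%02d' AND '${start + 3}%02d'"
  } else ""
```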
......@@ -40,10 +40,9 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
try {
if ("dsp_req".equalsIgnoreCase(business)) {
for (i <- 0 until 4) {
for (i <- 0 until 6) {
val df = processData(date, i, spark)
.repartition(5000)
.persist(StorageLevel.MEMORY_AND_DISK_SER)
df.persist(StorageLevel.MEMORY_AND_DISK_SER)
val iosTab = df.filter(plf => {
"ios".equals(plf._1)
......@@ -53,7 +52,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/ios/${i}"), true)
spark.createDataFrame(iosTab, iosSchema)
.coalesce(coalesce)
.repartition(coalesce)
.write.mode(SaveMode.Overwrite)
.option("orc.compress", "zlib")
.orc(output + s"/ios/${i}")
......@@ -67,7 +66,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/android/${i}"), true)
spark.createDataFrame(adrTab, adrSchema)
.coalesce(coalesce)
.repartition(coalesce)
.write.mode(SaveMode.Overwrite)
.option("orc.compress", "zlib")
.orc(output + s"/android/${i}")
......@@ -81,7 +80,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output + s"/other/${i}"), true)
spark.createDataFrame(otherTab, otherSchema)
.coalesce(coalesce)
.repartition(coalesce)
.write.mode(SaveMode.Overwrite)
.option("orc.compress", "zlib")
.orc(output + s"/other/${i}")
......@@ -90,8 +89,7 @@ abstract class EtlDeviceIdDaily extends CommonSparkJob with Serializable {
}
} else {
val df = processData(date, 0, spark)
.repartition(5000)
.persist(StorageLevel.MEMORY_AND_DISK_SER)
df.persist(StorageLevel.MEMORY_AND_DISK_SER)
val iosTab = df.filter(plf => {
"ios".equals(plf._1)
......
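
Note: two related shifts here: the eager repartition(5000) before persist is dropped (the cache keeps the source partitioning), and the writes switch from coalesce to repartition. coalesce(n) folds partitions together without a shuffle, which also caps the parallelism of the whole stage feeding the write; repartition(n) inserts a shuffle and rebalances, keeping upstream work at full parallelism at the cost of that shuffle. A minimal illustration, assuming an active SparkSession spark:

```scala
val df = spark.range(0L, 1000000L, 1L, 5000).toDF("id")
df.coalesce(1000).rdd.getNumPartitions    // 1000, no shuffle inserted
df.repartition(1000).rdd.getNumPartitions // 1000, full shuffle, even sizes
```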
......@@ -57,7 +57,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
.config("spark.sql.orc.filterPushdown", "true")
.config("spark.io.compression.codec", "lz4")
.config("spark.io.compression.lz4.blockSize", "64k")
.config("spark.sql.autoBroadcastJoinThreshold", "314572800")
.config("spark.sql.autoBroadcastJoinThreshold", "-1")
.config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.enableHiveSupport()
......
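
Note: 314572800 bytes (300 MB) allowed Spark to auto-broadcast fairly large join sides; -1 disables automatic broadcast joins entirely, so joins fall back to sort-merge unless hinted. If selective broadcasting is still wanted, it becomes opt-in (DataFrame names illustrative):

```scala
import org.apache.spark.sql.functions.broadcast

// With autoBroadcastJoinThreshold = -1, only an explicit hint broadcasts.
val joined = deviceDf.join(broadcast(smallDimDf), Seq("device_id"), "left")
```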