#!/bin/bash
# # # # # # # # # # # # # # # # # # # # # #
# @file : etl_ruid_mapping.sh
# @author : jinfeng.wang
# @time : 2020-05-22 16:06:07
# # # # # # # # # # # # # # # # # # # # # #
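# Usage: etl_ruid_mapping.sh [ScheduleTime]
# Load the shared DMP environment; MAPPING_LOG_DIR, RUID_MAPPING and JAR are
# assumed to be defined by ../dmp_env.sh, as they are not set in this script.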
source ../dmp_env.sh
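# Take the schedule time from $ScheduleTime if set, otherwise from the first
# argument, then derive yesterday's date as yyyymmdd and as a yyyy/MM/dd path.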
LOG_TIME=${ScheduleTime:-$1}
date=$(date +%Y%m%d -d "-1 day ${LOG_TIME}")
date_path=$(date +"%Y/%m/%d" -d "-1 day ${LOG_TIME}")
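# The mapping log is partitioned by hour under the day's directory.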
hours="00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
MAPPING_INPUT="${MAPPING_LOG_DIR}/${date_path}"
# Ensure every hourly input directory exists so the Spark job does not fail
# on a missing path; create it (with parents) when absent.
for hour in ${hours}; do
    if ! hadoop fs -ls "${MAPPING_INPUT}/${hour}" > /dev/null 2>&1; then
        hadoop fs -mkdir -p "${MAPPING_INPUT}/${hour}"
    fi
done
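# Output location for the day's ruid mapping.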
OUTPUT_PATH="${RUID_MAPPING}/${date_path}/"
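# Run the EtlRuidMapping job on YARN: 100 executors x 5 cores x 10g, with
# adaptive query execution enabled (512 MB advisory partition size) and
# recursive scanning of the input directories.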
spark-submit --class mobvista.dmp.datasource.dm.EtlRuidMapping \
--name "EtlRuidMapping.${date}" \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.default.parallelism=1000 \
--conf spark.kryoserializer.buffer.max=512m \
--conf spark.kryoserializer.buffer=64m \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \
--conf spark.sql.adaptive.enabled=true \
--conf spark.sql.adaptive.advisoryPartitionSizeInBytes=536870912 \
--conf spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive=true \
--master yarn --deploy-mode cluster --executor-memory 10g --driver-memory 4g --executor-cores 5 --num-executors 100 \
../${JAR} \
-date "${date}" -output "${OUTPUT_PATH}"
# Propagate any spark-submit failure to the scheduler.
if [[ $? -ne 0 ]]; then
    exit 255
fi