#!/bin/bash

# # # # # # # # # # # # # # # # # # # # # #
# @file    : etl_ruid_mapping.sh
# @author  : jinfeng.wang
# @time    : 2020-05-22 16:06:07
# # # # # # # # # # # # # # # # # # # # # #

source ../dmp_env.sh
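# dmp_env.sh is expected to define MAPPING_LOG_DIR, RUID_MAPPING, and JAR,
# all of which are referenced below but never set in this script.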

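# Resolve the run date from ScheduleTime (typically injected by the job
# scheduler), falling back to the first CLI argument; the job processes the
# previous day's logs.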
LOG_TIME=${ScheduleTime:-$1}
date=$(date +%Y%m%d -d "-1 day ${LOG_TIME}")
date_path=$(date +"%Y/%m/%d" -d "-1 day ${LOG_TIME}")

hours="00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23"

MAPPING_INPUT="${MAPPING_LOG_DIR}/${date_path}"

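# Create empty placeholder directories for any missing hours so the job's
# input listing does not fail on absent paths.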
for hour in ${hours}; do
    if ! hadoop fs -test -e "${MAPPING_INPUT}/${hour}"
    then
        hadoop fs -mkdir -p "${MAPPING_INPUT}/${hour}"
    fi
done

OUTPUT_PATH="${RUID_MAPPING}/${date_path}/"

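# Submit the Spark job (YARN cluster mode, 100 executors x 5 cores x 10g).
# AQE coalesces shuffle partitions toward the 512 MB advisory size, and
# recursive input listing lets the job read the hourly subdirectories.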
spark-submit --class mobvista.dmp.datasource.dm.EtlRuidMapping \
  --name "EtlRuidMapping.${date}" \
  --conf spark.sql.shuffle.partitions=1000 \
  --conf spark.default.parallelism=1000 \
  --conf spark.kryoserializer.buffer.max=512m \
  --conf spark.kryoserializer.buffer=64m \
  --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \
  --conf spark.sql.adaptive.enabled=true \
  --conf spark.sql.adaptive.advisoryPartitionSizeInBytes=536870912 \
  --conf spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive=true \
  --master yarn --deploy-mode cluster --executor-memory 10g --driver-memory 4g --executor-cores 5 --num-executors 100 \
  ../${JAR} \
  -date "${date}" -output "${OUTPUT_PATH}"

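# Propagate failure to the caller/scheduler: spark-submit in cluster mode
# exits nonzero when the application fails.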
if [[ $? -ne 0 ]]; then
  exit 255
fi