#!/bin/bash
# # # # # # # # # # # # # # # # # # # # # #
# @file  : install_list_eggplants.sh
# @author: houying
# @date  : 17-3-14
# # # # # # # # # # # # # # # # # # # # # #

# Pull in shared environment configuration (expected to define $ScheduleTime,
# $JAR, and the TMP_EGGPLANTS_* HDFS path variables used below).
source ../dmp_env.sh

# Operate on the day before the scheduled time; split it into Y/M/D parts
# for building dated HDFS paths.
LOG_TIME=$(date -d "$ScheduleTime 1 days ago" "+%Y%m%d")
year=${LOG_TIME:0:4}
month=${LOG_TIME:4:2}
day=${LOG_TIME:6:2}

DEVICE_PATH="$TMP_EGGPLANTS_INPUT_PATH/$year/$month/$day"
OUTPUT_PATH="$TMP_EGGPLANTS_OUTPUT_PATH/$year/$month/$day"
PROGRESS_PATH="$TMP_EGGPLANTS_PROGRESS_PATH/$year/$month/$day"

# Recreate the output directory. '-f' suppresses the error on the very first
# run, when $OUTPUT_PATH does not exist yet.
hadoop fs -rm -r -f "$OUTPUT_PATH"
hadoop fs -mkdir -p "$OUTPUT_PATH"
hadoop fs -mkdir -p "$PROGRESS_PATH"

# $1 - batch number (which iteration this is)
# $2 - s3 file to process
# The task below is disabled: it is kept inside a quoted here-doc fed to ':'
# so none of it executes.
: <<'COMMENT'
function http_catch_eggplants() {
    local num="$1"
    local file="$2"
    local local_file="result_$num.txt"
    echo "processing $file into $local_file ..."
    hadoop fs -get "$file" "s3_${num}.txt"
    echo "[start at $(date "+%Y-%m-%d %H:%M:%S")]"
    java -cp ../${JAR} mobvista.dmp.datasource.eggplants.HttpCatchEggplants "s3_$num.txt" "$local_file" "package.list"
    # Only upload when the catcher actually produced output lines.
    if [[ "$(cat ${local_file}|wc -l)" -ne "0" ]]
    then
        hadoop fs -rm "$OUTPUT_PATH/$local_file"
        hadoop fs -put "$local_file" "$OUTPUT_PATH"
    fi
    rm "s3_$num.txt" "$local_file"
    echo "[end at $(date "+%Y-%m-%d %H:%M:%S")]"
}

num=0
# Previously-completed files, used to skip work already done.
PROGRESS="$(hadoop fs -text "${PROGRESS_PATH}/*")"
# NOTE(review): 'hadoop fs -ls' prints the path in column 8 (perm, repl,
# owner, group, size, date, time, path); $6 selects the date column --
# confirm the intended column before ever re-enabling this block.
for file in $(hadoop fs -ls "$DEVICE_PATH" | awk -v'FS= +' '{print $6}')
do
    # Skip files already recorded in the progress markers.
    if [[ ! -z ${PROGRESS} ]] && [[ ! -z "$(grep "$file" <<< "$PROGRESS")" ]]
    then
        num=$(($num+1))
        continue
    fi
    http_catch_eggplants ${num} ${file}
    # Record a per-file progress marker so a rerun can resume.
    echo "$file" > "${num}_DONE"
    hadoop fs -put "${num}_DONE" "$PROGRESS_PATH"
    # Stop once the wall-clock day has rolled past the scheduled day.
    if [[ "$(date "+%Y%m%d")" != "$(date -d "$ScheduleTime" "+%Y%m%d")" ]]
    then
        break
    fi
    num=$(($num+1))
done
COMMENT