#!/bin/bash

# # # # # # # # # # # # # # # # # # # # # #
# @file  : collect_package_name.sh
# @author: houying
# @date  : 16-11-14
# @desc  : collect the list of package names that need to be crawled
# # # # # # # # # # # # # # # # # # # # # #

# Load the shared DMP environment. The helpers and variables this script uses
# but does not define (check_await, hive_cmd, PACKAGE_TMP_PATH, JAR, ...) are
# presumably provided by this file -- confirm against dmp_env.sh.
# NOTE: the path is relative to the current working directory, not to the
# location of this script.
source ../dmp_env.sh

# Absolute path of the directory that contains this script. "$0" is quoted so
# that a script path containing whitespace is handled correctly.
BASE_PATH="$(
  cd "$(dirname "$0")"
  pwd
)"

# Date processed by this run: one day before ScheduleTime (format YYYYMMDD).
# ScheduleTime is not assigned in this file; presumably it is injected by the
# scheduler or by dmp_env.sh.
LOG_DATE=$(date -d "$ScheduleTime 1 days ago" "+%Y%m%d")

# LOG_DATE split into the components used to build dated paths.
year=${LOG_DATE:0:4}
month=${LOG_DATE:4:2}
day=${LOG_DATE:6:2}

# Two days before ScheduleTime, i.e. the day preceding LOG_DATE.
YESTERDAY=$(date -d "$ScheduleTime 2 days ago" "+%Y%m%d")
yes_year=${YESTERDAY:0:4}
yes_month=${YESTERDAY:4:2}
yes_day=${YESTERDAY:6:2}

# Dated directory that the collected package lists are uploaded to.
PACKAGE_PATH="${PACKAGE_TMP_PATH}/${year}/${month}/${day}"
# Directory passed to the Spark job as its -pkginstallpath argument.
PACKAGE_INSTALL_PATH="${INSTALL_PACKAGE_TMP_PATH}"

#######################################
# Export the distinct (package_name, platform) pairs of recently updated
# campaigns from dwh.dim_adn_campaign and upload the result.
# Globals:
#   DIM_ADN_CAMPAIGN, PACKAGE_PATH, LOG_DATE, year, month, day (read)
# Arguments:
#   $1 - local output file; it is also uploaded into $PACKAGE_PATH
# Returns:
#   0 on success; non-zero if the date computation, the Hive query or the
#   upload fails
#######################################
campaign() {
  local out_file=$1

  # check_await is defined outside this file; presumably it blocks until the
  # given marker file exists -- confirm against dmp_env.sh.
  check_await "$DIM_ADN_CAMPAIGN/$year/$month/$day/_SUCCESS"

  # Lower bound for update_time: 4 days before LOG_DATE. Declaration and
  # assignment are split so that a failing `date` is not masked by `local`.
  local update_from
  update_from="$(date -d "$LOG_DATE 4 days ago" "+%Y-%m-%d")" || return 1

  local sql="
  select package_name, platform
  from dim_adn_campaign
  where year='$year'
    and month='$month'
    and day='$day'
    and update_time != ''
    and update_time>='$update_from'
  group by package_name, platform
  "

  # Stop here when the query fails so that an empty or partial file is never
  # uploaded and the caller sees the failure.
  hive_cmd "use dwh;$sql;" >"${out_file}" || return 1
  hadoop fs -put "${out_file}" "$PACKAGE_PATH"
}

# Disabled legacy implementation of install(), kept for reference only.
# The here-document below is fed to the no-op ':' builtin, so nothing in it is
# executed; the install() defined after this block is the one in effect.
# (The delimiter is unquoted, so the shell still performs parameter expansion
# on the text before ':' discards it.)
# The legacy version selected package_name/platform pairs from
# dm_install_list_v2 via Hive, dropping packages that appear in
# dev.dm_package_black_list for the same dt.
: <<!
install() {
    check_await "$DM_INSTALL_LIST/$yes_year/$yes_month/$yes_day"
    hive_cmd "
    use dwh;
    select b.package_name, b.platform
    from (
       select t.package_name
       from dev.dm_package_black_list t
       where t.dt='${yes_year}${yes_month}${yes_day}'
    ) a
    right outer join (
      select package_name, platform
      from dm_install_list_v2
      where dt='${yes_year}${yes_month}${yes_day}'
      group by package_name, platform
    ) b on a.package_name=b.package_name
    where a.package_name is null
    ;
    " > "$1"
}
!

#######################################
# Run the CrawPkgsSpark job and publish its output as a package list.
# Steps: clear the job's output directory, wait for the 14-day install list
# marker, run the Spark job, dump the job output into a local file and upload
# that file into $PACKAGE_PATH.
# Globals:
#   PACKAGE_INSTALL_PATH, DMP_INSTALL_LIST, PACKAGE_PATH, JAR, YESTERDAY,
#   yes_year, yes_month, yes_day (read)
# Arguments:
#   $1 - local output file; it is also uploaded into $PACKAGE_PATH
# Returns:
#   0 on success. Exits the whole script with status 255 when
#   PACKAGE_INSTALL_PATH is empty or when the Spark job, the dump or the
#   upload fails.
#######################################
install() {
  local out_file=$1

  # With an empty path the cleanup below would get no target and the dump
  # would read "/*", so refuse to continue.
  if [[ -z "${PACKAGE_INSTALL_PATH}" ]]; then
    echo "install: PACKAGE_INSTALL_PATH is empty" >&2
    exit 255
  fi

  # Remove the previous job output. The status is deliberately not checked:
  # the directory may simply not exist yet. (-rm -r replaces deprecated -rmr.)
  hadoop fs -rm -r "${PACKAGE_INSTALL_PATH}"

  local install_list_path="${DMP_INSTALL_LIST}/${yes_year}/${yes_month}/${yes_day}/14days"

  # check_await is defined outside this file; presumably it blocks until the
  # given marker file exists -- confirm against dmp_env.sh.
  check_await "${install_list_path}/_SUCCESS"

  spark-submit --class mobvista.dmp.datasource.apptag.CrawPkgsSpark \
    --conf spark.default.parallelism=2000 \
    --conf spark.sql.shuffle.partitions=2000 \
    --conf spark.sql.autoBroadcastJoinThreshold=31457280 \
    --conf spark.kryoserializer.buffer.max=512m \
    --conf spark.driver.maxResultSize=4g \
    --master yarn --deploy-mode cluster --name CrawPkgsSpark --executor-memory 8g --driver-memory 4g --executor-cores 4 --num-executors 100 \
    "../${JAR}" -pkginstallpath "${PACKAGE_INSTALL_PATH}" -coalesce 20 \
    -yesday "${YESTERDAY}" || exit 255

  # The glob is quoted so that hadoop, not the local shell, expands it.
  hadoop fs -text "${PACKAGE_INSTALL_PATH}/*" >"${out_file}" || exit 255
  hadoop fs -put "${out_file}" "$PACKAGE_PATH" || exit 255
}

#######################################
# Build bundle.data from the bundle/package mapping of the processed date and
# upload it into $PACKAGE_PATH. Each output line is the second tab-separated
# column of a mapping record (presumably the package name -- confirm against
# the mapping schema) followed by a tab and the literal "ios".
# Globals:
#   BUNDLE_PACKAGE_MAPPING_PATH, PACKAGE_PATH, year, month, day (read)
# Arguments:
#   None
# Outputs:
#   Writes ./bundle.data in the current working directory
# Returns:
#   0 on success; non-zero if reading/transforming the mapping or the upload
#   fails
#######################################
bundle_pkg_mapping() {
  local mapping_dir="$BUNDLE_PACKAGE_MAPPING_PATH/$year/$month/$day"

  # check_await is defined outside this file; presumably it blocks until the
  # given marker file exists -- confirm against dmp_env.sh.
  check_await "${mapping_dir}/_SUCCESS"

  # pipefail is enabled in a subshell only, so a failing `hadoop fs -text` is
  # detected without changing the options of the calling shell. The glob is
  # quoted so that hadoop, not the local shell, expands it.
  (
    set -o pipefail
    hadoop fs -text "${mapping_dir}/*" | awk -F '\t' '{print $2"\tios"}'
  ) >bundle.data || return 1

  # Replace any previously uploaded copy; a failure here is tolerated because
  # the file may not exist yet.
  hadoop fs -rm "${PACKAGE_PATH}/bundle.data"
  hadoop fs -put bundle.data "${PACKAGE_PATH}"
}

# Make sure the dated output directory exists.
if ! hadoop fs -test -e "${PACKAGE_PATH}"; then
  hadoop fs -mkdir -p "${PACKAGE_PATH}"
fi

# Remove whatever a previous run left in the output directory, then run the
# collector functions defined above one after another to gather package names.
# The glob is quoted so that hadoop, not the local shell, expands it; the
# status is not checked because the directory may be empty.
hadoop fs -rm "${PACKAGE_PATH}/*"

# Each collector receives "<name>.txt" as its local output file; the first
# failure aborts the script with status 255.
for collector in install campaign; do
  "${collector}" "${collector}.txt" || exit 255
done

# The status of this last step becomes the exit status of the script.
bundle_pkg_mapping
