#!/bin/bash

######################################################################
# author: fengliang
# date:  20170518
# desc:  Compute, for the age tag data, the sample count and the algorithm-recommended count of each age range
######################################################################

source ../prd_env.sh

# Report date = the day before $ScheduleTime, formatted as YYYYMMDD.
echo "ScheduleTime=$ScheduleTime"
date=$(date +%Y%m%d -d "-1 days $ScheduleTime")

year=${date:0:4}
month=${date:4:2}
day=${date:6:2}
date_path="${year}/${month}/${day}"

# HDFS inputs. get_recently_dir is not defined in this file (presumably it
# comes from ../prd_env.sh and resolves the latest directory available for
# $date -- TODO confirm its exact contract).
DSP_AGE_PATH="$(get_recently_dir "${AGE_GET_DSP_PATH}/" "$date" '')"
GA_AGE_PATH="$(get_recently_dir "${AGE_GET_GA_PATH}/" "$date" '')"
TOTAL_AGE_PATH="$(get_recently_dir "${AGE_CALC_DEVICE}/" "$date" '')"

# HDFS outputs, one directory per report day (yyyy/mm/dd).
SAMPLE_OUTPUT_PATH="${REPORT_SAMPLE_AGE_RESULT_PATH}/${date_path}"
SAMPLE_COUNTRY_OUTPUT_PATH="${REPORT_SAMPLE_AGE_COUNTRY_RESULT_PATH}/${date_path}"
OUTPUT_PATH="${REPORT_AGE_TAG_RESULT_PATH}/${date_path}"
COUNTRY_OUTPUT_PATH="${REPORT_AGE_TAG_COUNTRY_RESULT_PATH}/${date_path}"


# Local working files, all suffixed with the report date.
data_dir="../tmp"
sample_data_file="${data_dir}/age_sample_${date}.data"
sample_cnt_data_file="${data_dir}/age_sample_cnt_${date}.data"
total_data_file="${data_dir}/age_total_${date}.data"
total_cnt_data_file="${data_dir}/age_total_cnt_${date}.data"
recommend_data_file="${data_dir}/age_recommend_${date}.data"
recommend_cnt_data_file="${data_dir}/age_recommend_cnt_${date}.data"
# Fixed: this used the undefined ${data}, so the name never carried the date
# and every run shared the single file age_load_.load.
load_data_file="${data_dir}/age_load_${date}.load"

# mkdir -p is a no-op when the directory already exists.
mkdir -p "$data_dir"


# HiveQL script, executed in database dwh. Shell variables are expanded into
# the text before it is sent to Hive. Steps:
#   1. register the getAgeRange UDF from ../${JAR} and set job options;
#   2. expose $DSP_AGE_PATH and $GA_AGE_PATH as the external tables
#      tmp_dsp_age_<date> and tmp_ga_age_<date>;
#   3. union both into tmp_merge_age_<date>, keeping one row per
#      (device_id, device_type). row_number() has no ORDER BY, so which
#      source's row survives for a duplicated device is not defined here;
#   4. write sample counts per age range to $SAMPLE_OUTPUT_PATH;
#   5. write sample counts per country and age range to
#      $SAMPLE_COUNTRY_OUTPUT_PATH (devices without a match in
#      ods_dmp_device_total for ${REPORT_COUNTRIES} are counted as 'OTHER');
#   6. write per-country counts of dm_device_age rows, bucketed by
#      getAgeRange(age), to $COUNTRY_OUTPUT_PATH;
#   7. drop the three temporary tables.
# The "--" lines inside the string are HiveQL comments and are part of the
# text passed to Hive.
hql="
use dwh;
add jar ../${JAR};
create temporary function getAgeRange as 'mobvista.prd.datasource.udf.GetAgeRange';

set hive.cli.print.header=false;
set mapreduce.task.io.sort.mb=512;
set mapreduce.reduce.shuffle.parallelcopies=50;

set mapreduce.job.name=dmp_age_tag_fengliang;

drop table tmp_dsp_age_${date};
drop table tmp_ga_age_${date};
drop table tmp_merge_age_${date};

create external table tmp_dsp_age_${date}(
device_id string,
tag string,
agerange string,
type string,
device_type string
)
row format delimited
FIELDS TERMINATED BY '\t'
location '$DSP_AGE_PATH';


create external table tmp_ga_age_${date}(
device_id string,
tag string,
agerange string,
type string,
device_type string
)
row format delimited
FIELDS TERMINATED BY '\t'
location '$GA_AGE_PATH';

create table tmp_merge_age_${date} as
select t.device_id, t.device_type, t.agerange
from (
  select t.device_id, t.device_type, t.agerange,
    row_number() over(PARTITION BY t.device_id, t.device_type ) rk
  from (
    select t.device_id, t.device_type, t.agerange
    from tmp_ga_age_${date} t
    union all
    select t.device_id, t.device_type, t.agerange
    from tmp_dsp_age_${date} t
  ) t
) t
where t.rk=1;

set mapreduce.reduce.memory.mb=1536;

--- 整体
insert overwrite directory '$SAMPLE_OUTPUT_PATH'
select
case when agerange='1' then '0-17'
when agerange='2' then '18-24'
when agerange='3' then '25-44'
when agerange='4' then '45-59'
when agerange='5' then '60+'
else 'other' end as age, count(1) as cnt
from tmp_merge_age_${date} t
group by case when agerange='1' then '0-17'
when agerange='2' then '18-24'
when agerange='3' then '25-44'
when agerange='4' then '45-59'
when agerange='5' then '60+'
else 'other' end;

set mapreduce.reduce.memory.mb=2048;

-- 分国家样本量
insert overwrite directory '${SAMPLE_COUNTRY_OUTPUT_PATH}'
select t.country, case when agerange='1' then '0-17'
when agerange='2' then '18-24'
when agerange='3' then '25-44'
when agerange='4' then '45-59'
when agerange='5' then '60+'
else 'other' end as age, count(1) as cnt
from (
  select a.device_id, a.device_type, a.agerange,
  case when b.country is null or b.country ='' then 'OTHER'
else b.country end as country
  from tmp_merge_age_${date} a
  left outer join (
    select *
    from ods_dmp_device_total t
    where t.dt='${date}'
    and t.country in (${REPORT_COUNTRIES})
  ) b on a.device_id=b.device_id and a.device_type=b.device_type
) t
group by t.country, case when agerange='1' then '0-17'
when agerange='2' then '18-24'
when agerange='3' then '25-44'
when agerange='4' then '45-59'
when agerange='5' then '60+'
else 'other' end;


insert overwrite directory '${COUNTRY_OUTPUT_PATH}'
select t.country, agerange as age, count(1) as cnt
from (
  select a.device_id, a.device_type, a.agerange,
    case when b.country is null or b.country ='' then 'OTHER'
    else b.country end as country
  from
  (
    select getAgeRange(t.age) as agerange, t.device_id, t.device_type
    from dm_device_age t
    where t.year='${year}' and t.month='${month}' and day='${day}'
  ) a
  left outer join
  (
    select *
    from ods_dmp_device_total t
    where t.dt='${date}'
    and t.country in (${REPORT_COUNTRIES})
  ) b on a.device_id=b.device_id and a.device_type=b.device_type
) t
group by t.country, agerange;

drop table tmp_dsp_age_${date};
drop table tmp_ga_age_${date};
drop table tmp_merge_age_${date};
"

# Log the generated script, then run it. $offline_hive is expanded unquoted,
# so it may hold a command plus options (presumably set in ../prd_env.sh --
# confirm). Any Hive failure aborts the whole job with status 255.
echo -e "sql :\n $hql"
$offline_hive -e "$hql"
if [ $? -ne 0 ];then
  exit 255
fi

# Download the overall sample counts written by the Hive step.
hadoop fs -getmerge "$SAMPLE_OUTPUT_PATH" "$sample_data_file" || {
  echo "getmerge failed: $SAMPLE_OUTPUT_PATH -> $sample_data_file" >&2
  exit 255
}

# Download the per-country sample counts.
hadoop fs -getmerge "$SAMPLE_COUNTRY_OUTPUT_PATH" "$sample_cnt_data_file" || {
  echo "getmerge failed: $SAMPLE_COUNTRY_OUTPUT_PATH -> $sample_cnt_data_file" >&2
  exit 255
}

# Clear the MapReduce output directory before the job runs. As in the
# original, the exit status is not checked (presumably because the path may
# not exist yet -- confirm).
hadoop fs -rm -r "$OUTPUT_PATH"

# Compute the total device count per age range.
hadoop jar "../${JAR}" mobvista.prd.datasource.tag.mapreduce.AgeTotalJob -input "$TOTAL_AGE_PATH" \
  -Dmapreduce.fileoutputcommitter.algorithm.version=2 \
  -output "$OUTPUT_PATH" -reduceNum 20 || {
  echo "AgeTotalJob failed: input=$TOTAL_AGE_PATH output=$OUTPUT_PATH" >&2
  exit 255
}

# Download the overall totals produced by the job.
hadoop fs -getmerge "$OUTPUT_PATH" "$total_data_file" || {
  echo "getmerge failed: $OUTPUT_PATH -> $total_data_file" >&2
  exit 255
}

# Download the per-country totals written by the Hive step.
hadoop fs -getmerge "$COUNTRY_OUTPUT_PATH" "$total_cnt_data_file" || {
  echo "getmerge failed: $COUNTRY_OUTPUT_PATH -> $total_cnt_data_file" >&2
  exit 255
}

# The three files written by Hive's INSERT OVERWRITE DIRECTORY use \001 as
# the field delimiter (Hive's default); convert it to TAB in place.
# $total_data_file comes from the MapReduce job and is left untouched.
perl -pi -e 's|\001|\t|g' \
  "$sample_data_file" "$sample_cnt_data_file" "$total_cnt_data_file"

# Overall: for every age range present in both files, emit
#   <age> TAB <total - sample>
# A single awk does the join and the subtraction, so the status check below
# really covers the join (previously only the last stage of an awk|awk
# pipeline was checked, and a failed join produced an empty result silently).
awk -F '\t' '
  NR == FNR { sample[$1] = $2; next }
  ($1 in sample) { print $1 "\t" ($2 - sample[$1]) }
' "$sample_data_file" "$total_data_file" > "$recommend_data_file" || {
  echo "failed to build $recommend_data_file" >&2
  exit 255
}

# Per country: same computation keyed on (country, age). The composite
# subscript uses awk's SUBSEP, so keys can no longer collide the way the
# plain concatenation $1$2 could.
awk -F '\t' '
  NR == FNR { sample[$1, $2] = $3; next }
  (($1, $2) in sample) { print $1 "\t" $2 "\t" ($3 - sample[$1, $2]) }
' "$sample_cnt_data_file" "$total_cnt_data_file" > "$recommend_cnt_data_file" || {
  echo "failed to build $recommend_cnt_data_file" >&2
  exit 255
}

# Assemble the MySQL load file from scratch: overall sample rows, overall
# computed rows, per-country sample rows, per-country computed rows -- in
# that order. Each row is prefixed with the report date and fixed dimension
# labels; "-" fills the dimensions that do not apply.
{
  awk -F '\t' -v date="${date}" '{print date"\tDMP标签数据\t年龄\t整体\t"$1"\t-\t-\t样本量\t"$2}' "$sample_data_file"
  awk -F '\t' -v date="${date}" '{print date"\tDMP标签数据\t年龄\t整体\t"$1"\t-\t-\t算法推算量\t"$2}' "$recommend_data_file"
  awk -F '\t' -v date="${date}" '{print date"\tDMP标签数据\t年龄\t分国家\t"$1"\t"$2"\t-\t样本量\t"$3}' "$sample_cnt_data_file"
  awk -F '\t' -v date="${date}" '{print date"\tDMP标签数据\t年龄\t分国家\t"$1"\t"$2"\t-\t算法推算量\t"$3}' "$recommend_cnt_data_file"
} > "$load_data_file"



# Remove any rows previously loaded for this day and report so that a rerun
# does not duplicate them.
# Fixed: the load file built earlier in this script labels the computed rows
# '算法推算量', but the filter only listed '算法推荐量', so those rows were
# never deleted. Both spellings are listed so rows written under either label
# are cleaned up (assumes video_desc is the column that receives that label
# field of the load file -- confirm against the table definition).
del_sql="
  DELETE FROM dmp_report_reuslt
  where day_key='${date}' and dimension_type1='DMP标签数据'
  and dimension_type2='年龄' AND video_desc in ('样本量', '算法推算量', '算法推荐量');"

# del_sql already ends with ';', so no extra separator is added (the original
# produced an empty statement ';;').
load_sql="${del_sql}
load data local infile '${load_data_file}' into table dmp_report_reuslt;"

# $MYSQL_ETL is expanded unquoted on purpose: it is used as a command prefix
# that receives the SQL as its last argument (presumably a mysql client
# invocation ending in -e, defined in ../prd_env.sh -- confirm).
$MYSQL_ETL "$load_sql" || {
  echo "MySQL delete/load failed for ${load_data_file}" >&2
  exit 255
}

# Delete the local working files for this run; failures here do not affect
# the exit status, the job always ends with 0 at this point.
for work_file in \
  "$sample_data_file" "$total_data_file" "$recommend_data_file" "$load_data_file" \
  "$sample_cnt_data_file" "$total_cnt_data_file" "$recommend_cnt_data_file"; do
  rm "$work_file"
done

exit 0