cainixihuan05.sh 1.15 KB
Newer Older
wang-jinfeng committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
#!/usr/bin/env bash
#
# Prepares the S3 locations and proxy configuration for the crawler job that
# is submitted further down in this script.
# Reads ScheduleTime from the environment (it is not assigned in this file).

# Shared environment definitions. Stop immediately if they cannot be loaded
# instead of discovering it only after the output has been deleted.
source ../../dmp_env.sh || {
  echo "failed to source ../../dmp_env.sh" >&2
  exit 255
}

# Day partition (YYYY/MM/DD) derived from ScheduleTime. Abort when date cannot
# parse the value: an empty partition would point the recursive delete below
# at the wrong S3 prefix.
dt_slash_today=$(date -d "$ScheduleTime" +"%Y/%m/%d") || {
  echo "cannot derive date partition from ScheduleTime='${ScheduleTime}'" >&2
  exit 255
}

inputPath="s3://mob-emr-test/fan.jiang/mytest/tmp/crawl_app_info/input/${dt_slash_today}/05"
OutPath="s3://mob-emr-test/fan.jiang/mytest/tmp/crawl_app_info/output/${dt_slash_today}/05"

# Remove any output left by a previous run. The exit status is not checked, so
# a failed delete does not stop the job.
hadoop fs -rm -r "${OutPath}"

# Sleep fifteen minutes so the site is not crawled too frequently, which
# could get the IP banned.
sleep 900

# Per-category output locations below the common output prefix.
iosOutPath=${OutPath}/ios
adrOutPath=${OutPath}/adr
bundleOutPath=${OutPath}/bundle
#proxyPath="s3://mob-emr-test/liduo/dmp/proxy"

# Proxy configuration used for crawling Japanese web pages.
proxyPath="s3://mob-emr-test/lujunhao/dmp/proxy/tokyo_proxy.conf"

# Submit the crawler to Spark.
#
# The command is kept in an array rather than a single string so that each
# argument reaches spark-submit as exactly one word, with no word splitting or
# glob expansion applied to the path variables. Stray non-command text that had
# ended up between the --conf options and --deploy-mode is removed; it was
# being passed to spark-submit as extra positional arguments.
cmd=(
  spark-submit
  --class mobvista.dmp.datasource.apptag.CaiNiXiHuanCrawlerSpark
  --conf spark.network.timeout=720s
  --conf spark.sql.autoBroadcastJoinThreshold=31457280
  --deploy-mode cluster
  --name cainixihuan_AppInfoCrawlerSpark
  --executor-memory 4g
  --driver-memory 1g
  --executor-cores 1
  --num-executors 1
  "../../${JAR}"
  -input "${inputPath}"
  -iosoutput "${iosOutPath}"
  -adroutput "${adrOutPath}"
  -bundleoutput "${bundleOutPath}"
  -coalesce 1
  -proxy_path "${proxyPath}"
)

# Any spark-submit failure is reported to the caller as exit status 255.
if ! "${cmd[@]}"; then
  exit 255
fi