#!/usr/bin/env bash

source ../../dmp_env.sh

dt_slash_today=$(date -d "$ScheduleTime" +"%Y/%m/%d")

inputPath="s3://mob-emr-test/fan.jiang/mytest/tmp/crawl_app_info/input/${dt_slash_today}/05"
OutPath="s3://mob-emr-test/fan.jiang/mytest/tmp/crawl_app_info/output/${dt_slash_today}/05"

# Remove any previous output for this date partition before re-running.
hadoop fs -rm -r "${OutPath}"

# Sleep for 15 minutes to avoid crawling the pages too frequently and getting the IP banned.
sleep 900

iosOutPath=${OutPath}/ios
adrOutPath=${OutPath}/adr
bundleOutPath=${OutPath}/bundle

#proxyPath="s3://mob-emr-test/liduo/dmp/proxy"
# Proxy config used when crawling Japanese web pages.
proxyPath="s3://mob-emr-test/lujunhao/dmp/proxy/tokyo_proxy.conf"

# Submit the crawler Spark job; it writes iOS, Android, and bundle app-info outputs.
cmd="spark-submit \
  --class mobvista.dmp.datasource.apptag.CaiNiXiHuanCrawlerSpark \
  --conf spark.network.timeout=720s \
  --conf spark.sql.autoBroadcastJoinThreshold=31457280 \
  --files ${HIVE_SITE_PATH} \
  --master yarn \
  --deploy-mode cluster \
  --name cainixihuan_AppInfoCrawlerSpark \
  --executor-memory 4g \
  --driver-memory 1g \
  --executor-cores 1 \
  --num-executors 1 \
  ../../${JAR} \
  -input ${inputPath} \
  -iosoutput ${iosOutPath} \
  -adroutput ${adrOutPath} \
  -bundleoutput ${bundleOutPath} \
  -coalesce 1 \
  -proxy_path ${proxyPath}"

${cmd}

# Propagate job failure to the scheduler.
if [[ $? -ne 0 ]]; then
  exit 255
fi
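
# Usage sketch (assumptions: the script is launched by the DMP scheduler with
# ScheduleTime exported in a format accepted by GNU `date -d`, and ../../dmp_env.sh
# defines HIVE_SITE_PATH and JAR; the file name crawl_app_info_05.sh is hypothetical):
#   ScheduleTime="2021-01-05 00:00:00" bash crawl_app_info_05.sh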