Commit 6b3ba3b6 by WangJinfeng

fix tracking etl job

parent 361e9bab
......@@ -51,7 +51,7 @@ public class GetDevIdUtil {
if (StringUtils.isNotBlank(devId)) {
return MRUtils.JOINER.join(devId, deviceType);
} else {
return null;
return "";
}
}
......
......@@ -14,15 +14,31 @@ abstract class CommonSparkJob {
val ENCODING = "UTF-8"
val HTTPPREFIX = "http://test.com"
val DATA_SPLIT = "\t"
val didPtn = "^[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}$"
val imeiPtn = "^([0-9]{15,17})$"
val imeiMd5Ptn = "^([a-fA-F0-9]{32})$"
val andriodIdPtn = "^[a-zA-Z0-9]{16}$"
val oaidAnotherPtn = "^([a-fA-F0-9]{1,64})$"
val md5Ptn = """^([0-9a-zA-Z])\1{30,32}"""
// IDFA / GAID: hexadecimal identifier in the 8-4-4-4-12 UUID layout.
val didPtn = "^[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}$"
// All-zero UUID; callers reject ids that match this placeholder.
val allZero = "00000000-0000-0000-0000-000000000000"
// IMEI: 14 to 17 decimal digits.
val imeiPtn = "^([0-9]{14,17})$"
// Invalid-IMEI filter: one digit repeated across the whole value.
// The capture group consumes the first digit, so {13,16} covers the same
// 14-17 length range that imeiPtn accepts.
val imeiPtnAll = """^([0-9])\1{13,16}"""
// Android ID: 15 to 17 alphanumeric characters.
val andriodIdPtn = "^[a-zA-Z0-9]{15,17}$"
// Invalid-Android-ID filter: one character repeated across the whole value.
// Must be a raw (triple-quoted) string with a capture group: in a normal
// string literal "\1" is an octal escape, not a regex back-reference.
// {14,16} covers the 15-17 length range that andriodIdPtn accepts.
val andriodIdAll = """^([a-zA-Z0-9])\1{14,16}$"""
// MD5 digest: exactly 32 hexadecimal characters.
val md5Ptn = "^([a-fA-F0-9]{32})$"
// Invalid-MD5 filter: one alphanumeric character repeated 30 to 32 times in total.
val umd5Ptn = """^([0-9A-Za-z])\1{29,31}"""
// OAID: 16 to 64 characters drawn from letters, digits and '-'.
val oaidPtb = """^[0-9A-Za-z-]{16,64}$"""
// IPv4 address: four dot-separated octets, each in the range 0-255.
val ipPtn = """^(25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])){3}$"""
// Date: four digits, '-', two digits, '-', two digits.
val datePtn = """^\d{4}-\d{2}-\d{2}"""
val options = buildOptions()
val commParser = new BasicParser
......
......@@ -80,7 +80,7 @@ object AdnConstant {
} else {
""
}
if (StringUtils.isBlank(ruid) || ruid.length < 16) {
if (StringUtils.isBlank(ruid) || ruid.length < 16 || ruid.length > 64) {
ruid = ""
}
ruid
......
......@@ -3,7 +3,7 @@ package mobvista.dmp.datasource.adn
import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
import mobvista.dmp.util.MRUtils
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.commons.lang.StringUtils
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
......@@ -43,17 +43,17 @@ class AdnOrgLogEtlHours extends CommonSparkJob with Serializable {
// 该小时下的数据,所以改用 dwh.ods_adn_trackingnew_request_tmp_hb_request ,因为他和表dwh.ods_adn_trackingnew_request挂载是同时进行的,执行脚本 https://gitlab.mobvista.com/fan.jiang/ods_adn_trackingnew_click_merge/blob/master/job/ods_adn_trackingnew_request_merge.sh
// azkaban链接 https://dataplatform.mobvista.com:8443/manager?project=ods_adn_trackingnew_click_merge&flow=ods_adn_trackingnew_merge#executions
val sql =
s"""
|SELECT date, time, created timestamp, app_id, platform, os_version, sdk_version, device_model, screen_size, country_code,
| language, strategy, ip, imei, mac, dev_id android_id, gaid, idfa, device_brand, getDevId(cdn_ab) idfv, ext_packagename package_name,
| getDevId(ext_sysid) sysid, ext_oaid oaid, getRuid(ext_algo) ruid
| FROM dwh.ods_adn_trackingnew_request WHERE yyyy = '$year' and mm = '$month' and dd = '$day' and hh = '$hh' AND re = '$region'
| UNION
| SELECT date, time, created timestamp, app_id, platform, os_version, sdk_version, device_model, screen_size, country_code,
| language, strategy, ip, imei, mac, dev_id android_id, gaid, idfa, device_brand, getDevId(cdn_ab) idfv, ext_packagename package_name,
| getDevId(ext_sysid) sysid, ext_oaid oaid, getRuid(ext_algo) ruid
| FROM dwh.ods_adn_trackingnew_request_tmp_hb_request WHERE yyyy = '$year' and mm = '$month' and dd = '$day' and hh = '$hh' AND re = '${region}_hb_request'
|""".stripMargin
s"""
|SELECT date, time, created timestamp, app_id, platform, os_version, sdk_version, device_model, screen_size, country_code,
| language, strategy, ip, imei, mac, dev_id android_id, gaid, idfa, device_brand, getDevId(cdn_ab) idfv, ext_packagename package_name,
| getDevId(ext_sysid) sysid, ext_oaid oaid, getRuid(ext_algo) ruid
| FROM dwh.ods_adn_trackingnew_request WHERE yyyy = '$year' and mm = '$month' and dd = '$day' and hh = '$hh' AND re = '$region'
| UNION
| SELECT date, time, created timestamp, app_id, platform, os_version, sdk_version, device_model, screen_size, country_code,
| language, strategy, ip, imei, mac, dev_id android_id, gaid, idfa, device_brand, getDevId(cdn_ab) idfv, ext_packagename package_name,
| getDevId(ext_sysid) sysid, ext_oaid oaid, getRuid(ext_algo) ruid
| FROM dwh.ods_adn_trackingnew_request_tmp_hb_request WHERE yyyy = '$year' and mm = '$month' and dd = '$day' and hh = '$hh' AND re = '${region}_hb_request'
|""".stripMargin
try {
spark.udf.register("getDevId", AdnConstant.getDevId _)
......@@ -85,8 +85,68 @@ class AdnOrgLogEtlHours extends CommonSparkJob with Serializable {
val oaid = r.getAs[String]("oaid")
val ruid = r.getAs[String]("ruid")
if ((StringUtils.isNotBlank(idfa) && idfa.matches(MobvistaConstant.didPtn)) ||
(StringUtils.isNotBlank(idfv) && idfv.matches(MobvistaConstant.didPtn)) || (StringUtils.isNotBlank(oaid) && oaid.matches(MobvistaConstant.oaidPtb)) ||
val f_idfa = if (StringUtils.isNotBlank(idfa) && (idfa.matches(didPtn) && !idfa.matches(allZero) || idfa.matches(md5Ptn))) {
idfa
} else {
""
}
val f_idfv = if (StringUtils.isNotBlank(idfv) && (idfv.matches(didPtn) && !idfv.matches(allZero) || idfv.matches(md5Ptn))) {
idfv
} else {
""
}
val f_imei = if (StringUtils.isNotBlank(imei) && (imei.matches(imeiPtn) && !imei.matches(imeiPtnAll) || imei.matches(md5Ptn))) {
imei
} else {
""
}
val f_androidId = if (StringUtils.isNotBlank(androidId) && (androidId.matches(andriodIdPtn) && !androidId.matches(andriodIdAll) || androidId.matches(md5Ptn))) {
androidId
} else {
""
}
val f_oaid = if (StringUtils.isNotBlank(oaid) && oaid.length >= 16 && oaid.length <= 64) {
oaid
} else {
""
}
val f_gaid = if (StringUtils.isNotBlank(gaid) && (gaid.matches(didPtn) && !gaid.matches(allZero) || gaid.matches(md5Ptn))) {
gaid
} else {
""
}
val f_sysId = if (StringUtils.isNotBlank(sysId) && (sysId.matches(didPtn) && !sysId.matches(allZero) || sysId.matches(md5Ptn))) {
sysId
} else {
""
}
var f_platform = if (StringUtils.isNotBlank(platform)) {
platform.toLowerCase()
} else {
""
}
f_platform = if (f_platform.contains("ios") || f_platform.contains("iphone") || deviceBrand.toLowerCase.contains("apple")
|| deviceModel.toLowerCase.contains("iphone") || deviceModel.toLowerCase.contains("ipad") || osVersion.toLowerCase.contains("ios")
|| StringUtils.isNotBlank(f_idfa) || StringUtils.isNotBlank(f_idfv)) {
"ios"
} else if (f_platform.contains("android") || osVersion.toLowerCase.contains("android") ||
StringUtils.isNotBlank(f_imei) || StringUtils.isNotBlank(f_androidId) || StringUtils.isNotBlank(f_oaid) || StringUtils.isNotBlank(f_gaid)) {
"android"
} else {
"other"
}
if ((StringUtils.isNotBlank(f_idfa) || StringUtils.isNotBlank(f_idfv) || StringUtils.isNotBlank(f_imei) || StringUtils.isNotBlank(f_androidId) ||
StringUtils.isNotBlank(f_oaid) || StringUtils.isNotBlank(f_gaid) || StringUtils.isNotBlank(f_sysId) || StringUtils.isNotBlank(ruid)) &&
!"other".equals(f_platform)) {
MRUtils.JOINER.join(date, time, timestamp, appId, f_platform, osVersion, sdkVersion, deviceModel, screenSize, countryCode,
language, ip, f_imei, mac, f_androidId, f_gaid, f_idfa, deviceBrand, f_sysId, packageName, strategy, f_oaid, f_idfv, ruid)
} else {
null
}
/*
if ((StringUtils.isNotBlank(idfa) && idfa.matches(MobvistaConstant.didPtn)) || (StringUtils.isNotBlank(idfv) && idfv.matches(MobvistaConstant.didPtn)) ||
(StringUtils.isNotBlank(sysId) && sysId.matches(MobvistaConstant.didPtn)) || (StringUtils.isNotBlank(ruid) && ruid.length > 16)) {
val plt = if (StringUtils.isNotBlank(platform)) {
platform
......@@ -108,6 +168,7 @@ class AdnOrgLogEtlHours extends CommonSparkJob with Serializable {
} else {
null
}
*/
}).filter(l => {
StringUtils.isNotBlank(l)
})
......
......@@ -112,55 +112,55 @@ class AdnRequestSdkEtlDaily extends CommonSparkJob with java.io.Serializable {
if (StringUtils.isNotBlank(ruid) && ruid.length > 16) {
linesArr += Row(ruid, "ruid", platform, appId, model, brand, osVersion, country, strategy, region, 1)
}
if (StringUtils.isNotBlank(idfa) && idfa.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !idfa.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(idfa) && (idfa.matches(didPtn) && !idfa.matches(allZero) || idfa.matches(md5Ptn))) {
linesArr += Row(idfa, "idfa", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
if (StringUtils.isNotBlank(sysId)) {
linesArr += Row(sysId, "sysid", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
dev_tag = 0
if (StringUtils.isNotBlank(idfv) && idfv.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !idfv.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(idfv) && (idfv.matches(didPtn) && !idfv.matches(allZero) || idfv.matches(md5Ptn))) {
linesArr += Row(idfv, "idfv", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
} else {
if (StringUtils.isNotBlank(sysId)) {
linesArr += Row(sysId, "sysid", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
if (StringUtils.isNotBlank(idfv) && idfv.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !idfv.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(idfv) && (idfv.matches(didPtn) && !idfv.matches(allZero) || idfv.matches(md5Ptn))) {
linesArr += Row(idfv, "idfv", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
} else {
if (StringUtils.isNotBlank(idfv) && idfv.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !idfv.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(idfv) && (idfv.matches(didPtn) && !idfv.matches(allZero) || idfv.matches(md5Ptn))) {
linesArr += Row(idfv, "idfv", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
}
}
case "android" =>
var dev_tag = 1
if (StringUtils.isNotBlank(gaid) && gaid.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !gaid.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(gaid) && (gaid.matches(didPtn) && !gaid.matches(allZero) || gaid.matches(md5Ptn))) {
linesArr += Row(gaid, "gaid", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
if (StringUtils.isNotBlank(oaid) && oaid.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !oaid.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(oaid) && (oaid.matches(didPtn) && !oaid.matches(allZero) || oaid.matches(md5Ptn))) {
linesArr += Row(oaid, "oaid", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
dev_tag = 0
if (StringUtils.isNotBlank(sysId)) {
linesArr += Row(sysId, "sysid", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
if (StringUtils.isNotBlank(imei) && imei.matches(mobvista.dmp.common.MobvistaConstant.imeiPtn)) {
if (StringUtils.isNotBlank(imei) && (imei.matches(imeiPtn) || imei.matches(md5Ptn))) {
linesArr += Row(imei, "imei", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
if (StringUtils.isNotBlank(androidId) && androidId.matches(mobvista.dmp.common.MobvistaConstant.andriodIdPtn)) {
if (StringUtils.isNotBlank(androidId) && (androidId.matches(andriodIdPtn) || androidId.matches(md5Ptn))) {
linesArr += Row(androidId, "androidId", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
} else {
if (StringUtils.isNotBlank(oaid) && oaid.matches(mobvista.dmp.common.MobvistaConstant.didPtn) && !oaid.matches(mobvista.dmp.common.MobvistaConstant.allZero)) {
if (StringUtils.isNotBlank(oaid) && (oaid.matches(didPtn) && !oaid.matches(allZero) || oaid.matches(md5Ptn))) {
linesArr += Row(oaid, "oaid", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
if (StringUtils.isNotBlank(imei) && imei.matches(mobvista.dmp.common.MobvistaConstant.imeiPtn) && "android".equals(platform)) {
if (StringUtils.isNotBlank(imei) && (imei.matches(imeiPtn) || imei.matches(md5Ptn))) {
if (dev_tag == 1) {
dev_tag = 0
}
linesArr += Row(imei, "imei", platform, appId, model, brand, osVersion, country, strategy, region, dev_tag)
}
if (StringUtils.isNotBlank(androidId) && androidId.matches(mobvista.dmp.common.MobvistaConstant.andriodIdPtn) && "android".equals(platform)) {
if (StringUtils.isNotBlank(androidId) && (androidId.matches(andriodIdPtn) || androidId.matches(md5Ptn))) {
if (dev_tag == 1) {
dev_tag = 0
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment