1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
package mobvista.dmp.datasource.clever
import java.net.{URI, URLDecoder}
import java.util
import mobvista.dmp.common.{CommonSparkJob, UrlParser}
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.lang.StringUtils
import org.apache.commons.net.util.Base64
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
class ParseCleverDaily extends CommonSparkJob with Serializable {
override val DATA_SPLIT = " "
val FIELD_SPLIT = "\t"
val urlParser = new UrlParser
var word = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=="
var secret = "uVUoXc3pCnDFvb8lNJj9ZHEia7QYrfSmROkGKA0ehIdtzB64Mq2gP5syTL1wWx+/=="
val pDict = new util.HashMap[Character, Character]
for (i <- 0 until word.length) {
pDict.put(secret.charAt(i), word.charAt(i))
}
word = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=="
secret = "vSoajc7dRzpWifGyNxZnV5k+DHLYhJ46lt0U3QrgEuq8sw/XMeBAT2Fb9P1OIKmC=="
val cleverDict = new util.HashMap[Character, Character]
for (i <- 0 until word.length) {
cleverDict.put(secret.charAt(i), word.charAt(i))
}
def run(args: Array[String]): Int = {
var sc: SparkContext = null
try {
val commandLine = commParser.parse(options, args)
if (!checkMustOption(commandLine)) {
printOptions(commandLine)
return 1
} else {
printOptions(commandLine)
}
val input = commandLine.getOptionValue("input")
val output = commandLine.getOptionValue("output")
val parallelism = commandLine.getOptionValue("parallelism").toInt
val coalesce = commandLine.getOptionValue("coalesce").toInt
val spark = SparkSession.builder()
.appName("ParseCleverDaily")
.config("spark.rdd.compress", "true")
.config("spark.default.parallelism", s"$parallelism")
.config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.getOrCreate()
import spark.implicits._
sc = spark.sparkContext
FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)
val data = sc.textFile(input)
val resultRDD = data.filter(record => record.contains("/openapi/clever?p="))
.map(record => {
// 获取参数P的值
val splits = StringUtils.splitPreserveAllTokens(record, DATA_SPLIT, -1)
var url = splits(6)
url = s"$HTTPPREFIX/$url"
urlParser.evaluate(url, QUERY, "p")
})
.filter(StringUtils.isNotEmpty(_))
.map(pValue => {
// 对P参数值进行Base64解码,并拼接成url
val pDecode = decodeParams(pValue, pDict)
s"$HTTPPREFIX/test?${pDecode}"
})
.filter(url => {
// 1、过滤掉gaid或clever为null的数据
// 2、过滤到clever解密后不是jsonArray的数据
val gaid = urlParser.evaluate(url, QUERY, "gaid")
val clever = urlParser.evaluate(url, QUERY, "clever")
if (StringUtils.isNotEmpty(gaid) && StringUtils.isNotEmpty(clever)) {
try {
val cleverDecode = decodeParams(URLDecoder.decode(clever, ENCODING), cleverDict)
if (cleverDecode.trim.startsWith("[") && cleverDecode.trim.endsWith("]")) {
true
} else {
false
}
} catch {
case e: Exception => {
e.printStackTrace()
false
}
}
} else {
false
}
})
.flatMap(parseClever(_))
.distinct()
.coalesce(coalesce, true)
.toDF()
.write
.format("ORC")
.option("orc.compress", "zlib")
.save(output)
} finally {
if (sc != null) {
sc.stop()
}
}
0
}
def parseClever(url: String): Array[CleverVO] = {
val buffer = new ArrayBuffer[CleverVO]()
val gaid = urlParser.evaluate(url, QUERY, "gaid")
val model = urlParser.evaluate(url, QUERY, "model")
val brand = urlParser.evaluate(url, QUERY, "brand")
val clever = urlParser.evaluate(url, QUERY, "clever")
val cleverDecode = decodeParams(URLDecoder.decode(clever, ENCODING), cleverDict)
try {
val jsonArray = GsonUtil.String2JsonArray(cleverDecode)
jsonArray.foreach(element => {
val jsonObject = element.getAsJsonObject
val packageName = jsonObject.get("p").getAsString.replaceAll("[\\s\r\n]+", "")
buffer += CleverVO(gaid, "gaid", "android", model, brand, packageName)
})
} catch {
case e: Exception => {
println("sdfsdfsdfs= " + cleverDecode)
println("url= " + url)
e.printStackTrace()
}
}
buffer.toArray
}
def decodeParams(str: String, dict: util.Map[Character, Character]): String = {
val array = str.toCharArray
for (i <- 0 until array.length()) {
try {
array(i) = dict.get(array(i))
} catch {
case e: Exception => {
println(str)
println(i)
e.printStackTrace()
}
}
}
val decode = String.valueOf(array)
return new String(Base64.decodeBase64(decode))
}
}
case class CleverVO(device_id: String, device_type: String, platform: String, model: String, brand: String, package_name: String) {
override def hashCode() = {
this.device_id.hashCode + this.device_type.hashCode
}
override def equals(obj: scala.Any): Boolean = {
if (obj.isInstanceOf[CleverVO]) {
val tmp = obj.asInstanceOf[CleverVO]
return tmp.device_id.equals(this.device_id) && tmp.device_type.equals(this.device_type)
}
return false
}
}
object ParseCleverDaily {
def main(args: Array[String]): Unit = {
new ParseCleverDaily().run(args)
}
}