1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
package mobvista.dmp.demo
import java.net.URI
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
/**
 * Spark job that writes the distinct non-numeric `package_name` values found in iOS records.
 *
 * Every input line is split on tabs. Field 2 (zero-based) is compared with "ios" and field 3
 * is parsed as a JSON array whose elements are objects carrying a `package_name` member.
 *
 * Created by fl on 2017/7/13.
 */
class FindBundleId extends Serializable {

  /** Regular expression for values made up solely of ASCII digits; package names matching it are skipped. */
  val regex = "^[0-9]+$"

  // Compiled once per instance: String.matches would recompile the expression for every element.
  private val numericName = java.util.regex.Pattern.compile(regex)

  /**
   * Runs the job.
   *
   * @param args `args(0)` is the input path read with `textFile`; `args(1)` is the output path,
   *             which is deleted recursively before the result is saved there
   * @return 0 once the output has been written; failures propagate as exceptions
   * @throws IllegalArgumentException if fewer than two arguments are supplied
   */
  def run(args: Array[String]): Int = {
    require(args.length >= 2, "usage: FindBundleId <input path> <output path>")
    val inputPath = args(0)
    val outputPath = args(1)

    val conf = new SparkConf().setAppName("Find")
    val sc = new SparkContext(conf)
    try {
      // Resolve the file system from the output path itself so that the directory being
      // deleted is always the one saveAsTextFile is about to write, whatever its bucket/scheme.
      val target = new Path(outputPath)
      target.getFileSystem(sc.hadoopConfiguration).delete(target, true)

      // A single pass both selects the iOS records and extracts their package names,
      // so the JSON column is parsed once per line instead of twice.
      sc.textFile(inputPath)
        .flatMap(line => iosPackageNames(line))
        .distinct(50)
        .saveAsTextFile(outputPath)
      0
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

  /**
   * Tells whether a line is worth keeping.
   *
   * @param line one tab-separated input record
   * @return true when the platform field equals "ios" and at least one element of the JSON
   *         column has a `package_name` that is not purely numeric; false otherwise, including
   *         for lines that have fewer than four fields
   */
  def doFilter(line: String): Boolean = iosPackageNames(line).hasNext

  /** Lazily yields the non-numeric package names of an iOS line; empty for any other line. */
  private def iosPackageNames(line: String): Iterator[String] =
    Option(StringUtils.splitPreserveAllTokens(line, "\t", -1)) match {
      // Lines with too few fields are skipped instead of failing the whole job.
      case Some(fields) if fields.length > 3 && "ios" == fields(2) =>
        nonNumericPackageNames(fields(3))
      case _ =>
        Iterator.empty
    }

  /** Lazily yields every `package_name` in the JSON array `tags` that is not purely numeric. */
  private def nonNumericPackageNames(tags: String): Iterator[String] = {
    // Explicit decorators instead of the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    Option(GsonUtil.String2JsonArray(tags)) match {
      case Some(jsonArray) =>
        jsonArray.asScala.iterator.flatMap { element =>
          // JsonObject.get returns null for a missing member; JSON null has no string form.
          Option(element.getAsJsonObject.get("package_name"))
            .filterNot(_.isJsonNull)
            .map(_.getAsString)
            .filterNot(name => numericName.matcher(name).matches())
            .iterator
        }
      case None =>
        Iterator.empty
    }
  }
}
object FindBundleId {
  /**
   * Command-line entry point: builds a [[FindBundleId]] job and runs it with the given arguments.
   * The integer returned by `run` is discarded.
   *
   * @param args forwarded unchanged to `run`
   */
  def main(args: Array[String]): Unit = {
    val job = new FindBundleId
    job.run(args)
  }
}