Commit 15972dc8, authored May 28, 2021 by wang-jinfeng (王金锋), in mobvista-dmp

optimize dmp
Parent: fd0560d0
Showing 4 changed files with 100 additions and 192 deletions:

  azkaban/rtdmp/rtdmp.job                                        +0   -0
  azkaban/rtdmp/rtdmp.sh                                         +36  -14
  src/main/scala/mobvista/dmp/datasource/rtdmp/Logic.scala       +0   -3
  src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMain.scala   +64  -175
azkaban/rtdmp/rtdmp → azkaban/rtdmp/rtdmp.job (view file @ 15972dc8)

File moved.
azkaban/rtdmp/rtdmp.sh (view file @ 15972dc8)
...
...
@@ -4,25 +4,47 @@ source ../dmp_env.sh

today=${ScheduleTime}

date_time=$(date +"%Y-%m-%d %H" -d "-1 hour $today")
date_time=$(date +"%Y-%m-%d.%H" -d "-1 hour $today")
date_path=$(date +%Y/%m/%d/%H -d "-1 hour $today")

part_num=$(hadoop fs -ls s3://mob-emr-test/dataplatform/rtdmp_pre/${date_path}/ | wc -l)

if [[ ${part_num} -le 50 ]]; then
  echo "This Dir No Data !!!"
  partition=10
  coalesce=10
  executor=2
  memory=4
  core=2
  flag=0
else
  partition=2000
  coalesce=200
  executor=8
  memory=10
  core=4
  flag=1
fi
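A note on the sizing branch above: `hadoop fs -ls` prints a summary line plus one line per entry, so `part_num -le 50` effectively reads as "the hour's directory holds at most ~50 objects". In that case the script shrinks the Spark footprint and sets flag=0, which, as the `if (flag == 1)` branch in RTDmpMain.scala below shows, skips the merge work entirely.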
INPUT="s3://mob-emr-test/dataplatform/rtdmp_pre/${date_path}"
OUTPUT="s3://mob-emr-test/dataplatform/rtdmp_deal/${date_path}/0"

spark-submit --class mobvista.dmp.datasource.rtdmp.RTDmpMainDeal \
  --name "RTDmpMainDeal.${date_time}" \
  --conf spark.sql.shuffle.partitions=10000 \
  --conf spark.default.parallelism=500 \
  --conf spark.kryoserializer.buffer.max=256m \
  --conf spark.speculation=true \
  --conf spark.speculation.quantile=0.9 \
  --conf spark.speculation.multiplier=1.3 \
  --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \
  --master yarn --deploy-mode cluster --executor-memory 4g --driver-memory 4g --executor-cores 4 --num-executors 50 \
  ../${JAR} -time "${date_time}" -data_utime "${date_time}" -input ${INPUT} -output ${OUTPUT} -coalesce 200 -partition 10000

OUTPUT="s3://mob-emr-test/dataplatform/rtdmp_deal/${date_path}"

before_date_path=$(date +%Y/%m/%d/%H -d "-2 hour $today")
BEFORE_OUTPUT="s3://mob-emr-test/dataplatform/rtdmp/${before_date_path}"

check_await "${BEFORE_OUTPUT}/_SUCCESS"

spark-submit --class mobvista.dmp.datasource.rtdmp.RTDmpMain \
  --name "RTDmpMain.${date_time}" \
  --conf spark.sql.shuffle.partitions=${partition} \
  --conf spark.default.parallelism=${partition} \
  --conf spark.kryoserializer.buffer.max=512m \
  --conf spark.kryoserializer.buffer=64m \
  --master yarn --deploy-mode cluster \
  --executor-memory ${memory}g --driver-memory 6g --executor-cores ${core} --num-executors ${executor} \
  .././DMP.jar \
  -flag ${flag} -time ${date_time} -input ${INPUT} -output ${OUTPUT} -coalesce ${coalesce}

if [[ $? -ne 0 ]]; then
  exit 255
...
...
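The two date_time assignments above are the old and new sides of the diff: the separator between date and hour changes from a space to a dot, which lets the later `-time ${date_time}` argument survive shell word splitting even unquoted. The Scala side of this commit restores the space with `.replace(".", " ")` (see RTDmpMain.scala below). A minimal round-trip sketch, with an assumed example timestamp:

object TimeArgRoundTrip {
  def main(args: Array[String]): Unit = {
    // What rtdmp.sh now produces via date +"%Y-%m-%d.%H" (example value assumed)
    val fromShell = "2021-05-28.00"
    // What RTDmpMain does after this commit: getOptionValue("time").replace(".", " ")
    val time = fromShell.replace(".", " ")
    assert(time == "2021-05-28 00") // back to the "yyyy-MM-dd HH" form DateUtil expects
    println(time)
  }
}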
src/main/scala/mobvista/dmp/datasource/rtdmp/Logic.scala (view file @ 15972dc8)
...
...
@@ -31,7 +31,6 @@ import scala.collection.{immutable, mutable}

object Logic {

  def getResultFeature(session: CqlSession, iterator: Iterator[Row]): Iterator[AudienceInfo] = {
    val sql =
      """
        |select audience_data from rtdmp.audience_info where devid = '@devid'
...
...
@@ -39,7 +38,6 @@ object Logic {
    val res = new ArrayBuffer[AudienceInfo]()
    iterator.foreach(row => {
      // val session = connector.openSession()
      val devId = row.getAs[String](0)
      val audience_data = row.getAs[String](1)
      val query_sql = sql.replace("@devid", devId)
...
...
@@ -49,7 +47,6 @@ object Logic {
      } else {
        new JSONObject().toJSONString
      }
      // session.close()
      res.add(AudienceInfo(devId, audience_data, old_audience_data))
    })
    res.iterator()
...
...
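The two commented-out lines removed here (`connector.openSession()` / `session.close()`) mark a session-lifecycle cleanup: getResultFeature now receives an open CqlSession from its caller instead of managing its own. The calling pattern, as it appears in RTDmpMain.scala below, wraps each partition in withSessionDo so one session serves every row lookup in that partition (a sketch, not standalone code: `rdd` stands for any RDD[Row] of (devid, audience_data) pairs, and `cassandraConnector` is the CassandraConnector built later in the diff):

import com.datastax.spark.connector.cql.CassandraConnector

// One Cassandra session per partition, shared by all lookups in it.
val withAudience = rdd.mapPartitions(it =>
  cassandraConnector.withSessionDo(session =>
    Logic.getResultFeature(session, it)))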
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMain.scala (view file @ 15972dc8)
package mobvista.dmp.datasource.rtdmp

import java.net.URI
import java.util

import com.alibaba.fastjson.{JSONArray, JSONObject}
import com.datastax.oss.driver.api.core.ConsistencyLevel
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.rdd.ReadConf
import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
import mobvista.dmp.datasource.rtdmp.Constant.NewAudienceInfo
import mobvista.dmp.util.{DateUtil, MD5Util, PropertyUtil}
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.storage.StorageLevel

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
/**
* @package: mobvista.dmp.datasource.rtdmp
...
...
@@ -31,12 +30,10 @@ class RTDmpMain extends CommonSparkJob with Serializable {

  def commandOptions(): Options = {
    val options = new Options()
    options.addOption("time", true, "time")
    options.addOption("input", true, "input")
    options.addOption("output", true, "output")
    options.addOption("coalesce", true, "coalesce")
    options.addOption("partition", true, "partition")
    // options.addOption("update_time_start", true, "update_time_start")
    // options.addOption("update_time_end", true, "update_time_end")
    options.addOption("data_utime", true, "data_utime")
    options.addOption("flag", true, "flag")
    options
  }
...
...
@@ -45,203 +42,95 @@ class RTDmpMain extends CommonSparkJob with Serializable {
    val parser = new BasicParser()
    val options = commandOptions()
    val commandLine = parser.parse(options, args)

    val time = commandLine.getOptionValue("time")
    val time = commandLine.getOptionValue("time").replace(".", " ")
    val input = commandLine.getOptionValue("input")
    val output = commandLine.getOptionValue("output")
    val coalesce = commandLine.getOptionValue("coalesce")
    val partition = commandLine.getOptionValue("partition")
    // var update_time_start = commandLine.getOptionValue("update_time_start")
    // var update_time_end = commandLine.getOptionValue("update_time_end")
    val data_utime = commandLine.getOptionValue("data_utime")
    val flag = Integer.parseInt(commandLine.getOptionValue("flag"))

    val system = "rtdmp"
    val region = "vg"

    val spark: SparkSession = SparkSession.builder()
      .appName(s"RTDmpMain.$time")
      .appName(s"RTDmpMainSpe.$time")
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "snappy")
      .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.cassandra.connection.host", PropertyUtil.getProperty("ip.properties", "aws_host"))
      .config("spark.cassandra.connection.factory", s"mobvista.dmp.utils.cassandra.$system.${region.toUpperCase}Factory")
      .config("spark.cassandra.connection.host", PropertyUtil.getProperty("ip.properties", s"$system.$region.host"))
      .config("spark.cassandra.connection.port", "9042")
      .config("spark.cassandra.connection.factory", s"mobvista.dmp.utils.cassandra.AWSFactory")
      .config("spark.cassandra.connection.connections_per_executor_max", "512")
      .config("spark.cassandra.output.concurrent.writes", "32")
      .config("spark.cassandra.concurrent.reads", "1024")
      .config("spark.cassandra.connection.remoteConnectionsPerExecutor", "64")
      .config("spark.cassandra.connection.localConnectionsPerExecutor", "32")
      .config("spark.cassandra.query.retry.count", "10")
      .config("spark.cassandra.connection.compression", "LZ4")
      .config("spark.cassandra.input.consistency.level", "LOCAL_ONE")
      .config("spark.cassandra.output.consistency.level", "LOCAL_ONE")
      .config("spark.cassandra.input.fetch.sizeInRows", "2048")
      .config("spark.cassandra.concurrent.reads", "2048")
      .config("spark.cassandra.output.concurrent.writes", "16")
      .config("spark.cassandra.output.batch.grouping.buffer.size", "1024")
      .config("spark.cassandra.connection.keep_alive_ms", "600000")
      .config("spark.cassandra.output.batch.size.bytes", "1024")
      .config("spark.cassandra.connection.keepAliveMS", "60000")
      .config("spark.cassandra.auth.username", "U&6zBV$*wBuYUpJRq$hp")
      .config("spark.cassandra.auth.password", "Z8tzjTMBe^M2#hat$nAJ")
      .getOrCreate()
    // .config("spark.cassandra.input.consistency.level", "LOCAL_QUORUM")
    // .config("spark.cassandra.output.consistency.level", "LOCAL_QUORUM")
    // .config("spark.cassandra.connection.connections_per_executor_max", "8")

    val sc = spark.sparkContext
    try {
      var mergeRDD = sc.emptyRDD[(String, (Int, String))]
      import spark.implicits._
      if (flag == 1) {
        val expire_time = DateUtil.getDayByString(time, "yyyy-MM-dd HH", -7)
        // By default, compute the previous hour's data
        val update_time_start = DateUtil.format(time + ":00:00", "yyyy-MM-dd HH:mm:ss")
        val update_time_end = DateUtil.format(time + ":59:59", "yyyy-MM-dd HH:mm:ss")
        val audience_date_utime_start = DateUtil.parse(data_utime, "yyyy-MM-dd HH").getTime / 1000 - 28800
        val audience_date_utime_end = DateUtil.parse(data_utime, "yyyy-MM-dd HH").getTime / 1000 - 25200
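The epoch offsets are worth decoding: 28800 s is 8 hours and 25200 s is 7 hours, so audience_date_utime_start and audience_date_utime_end bracket a one-hour window ending 7 hours before data_utime in epoch seconds. Presumably this shifts the UTC+8 wall-clock hour back to UTC, though the diff itself does not say so.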
        val map: util.Map[Integer, (JSONArray, Integer, Integer, JSONObject)] =
          ServerUtil.request(update_time_start, update_time_end, audience_date_utime_start, audience_date_utime_end, 0, 0, 2)
        println(s"map -->> $map")

        map.foreach(t => {
          val audienceId = Integer2int(t._1)
          val audienceOp = t._2._2
          val dmap = new mutable.HashMap[String, String]()
          t._2._1.foreach(json => {
            val jsonObject = json.asInstanceOf[JSONObject]
            if (jsonObject.containsKey("s3_path") && StringUtils.isNotBlank(jsonObject.getString("s3_path"))) {
              // (s3_path, update_date)
              dmap.put(jsonObject.getString("s3_path"), jsonObject.getString("update_time"))
            }
          })
          // Check whether any s3_path's update_date equals the current update_date, to filter out invalid entries
          // if (dmap.values.contains(time)) {
          /**
           * If audienceOp == 0 and dmap.size >= 2, compute a set difference: devices that appeared in the
           * previous partition but not in this one have their package set to -1 * audienceId so that
           * downstream jobs can delete them; otherwise, merge with the previous audience package.
           */
          val updateRDD = if (audienceOp == 1 && dmap.size >= 2) {
            val list = dmap.toList.sortWith(_._2 > _._2).take(1)
            // Sort by update_date in descending order and take the leading audience package(s)
            val newAudience = sc.textFile(list.get(0)._1).map(r => {
              // First entry is the newest audience package
              val device_id = if (r.matches(MobvistaConstant.md5Ptn)) {
                r
              } else {
                MD5Util.getMD5Str(r)
              }
              (device_id, (audienceId, list.get(0)._2))
            })
            /*
            val oldAudience = sc.textFile(list.get(1)._1).map(r => { // Second entry is the previous version of the audience package
              val device_id =
                if (r.matches(MobvistaConstant.md5Ptn)) {
                  r
                } else {
                  MD5Util.getMD5Str(r)
                }
              (device_id, (audienceId, list.get(1)._2))
            })
            oldAudience.subtractByKey(newAudience).map(t => {
              // Multiply audienceId by -1 in the difference set, so downstream can delete that audienceId
              val device_id =
                if (t._1.matches(MobvistaConstant.md5Ptn)) {
                  t._1
                } else {
                  MD5Util.getMD5Str(t._1)
                }
              (device_id, ((-1) * audienceId, t._2._2))
              // (devId, ((-1) * audienceId, update_date))
            }).union(newAudience) // merge with the newest audience package
            */
            newAudience
          } else {
            val audData = dmap.toList.sortWith(_._2 > _._2)
            if (audData.nonEmpty) {
              sc.textFile(audData.get(0)._1).map(r => {
                // Take the newest audience package
                val device_id = if (r.matches(MobvistaConstant.md5Ptn)) {
                  r
                } else {
                  MD5Util.getMD5Str(r)
                }
                (device_id, (audienceId, audData.get(0)._2))
              })
            } else {
              // Otherwise, create an empty RDD
              sc.emptyRDD[(String, (Int, String))]
            }
          }
          // Merge all audience packages
          mergeRDD = mergeRDD.union(updateRDD)
          // }
        })
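The doc comment above states the intended difference semantics, and the commented-out block shows the original implementation: devices present in the previous package but missing from the newest one get -1 * audienceId so downstream jobs delete them. A self-contained toy version of that subtractByKey step (the audience id and device ids are invented):

import org.apache.spark.{SparkConf, SparkContext}

object AudienceDiffSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("diff-sketch").setMaster("local[1]"))
    val audienceId = 42 // hypothetical audience package id
    // (device_id, (audienceId, update_date)) pairs, shaped like those built in the diff above
    val newAudience = sc.parallelize(Seq(("devA", (audienceId, "2021-05-28 00")), ("devB", (audienceId, "2021-05-28 00"))))
    val oldAudience = sc.parallelize(Seq(("devB", (audienceId, "2021-05-27 23")), ("devC", (audienceId, "2021-05-27 23"))))
    // devC dropped out of the package: mark it with -audienceId for downstream deletion
    val merged = oldAudience.subtractByKey(newAudience)
      .map { case (dev, (_, utime)) => (dev, (-1 * audienceId, utime)) }
      .union(newAudience)
    merged.collect().sortBy(_._1).foreach(println)
    // (devA,(42,...)), (devB,(42,...)), (devC,(-42,...))
    sc.stop()
  }
}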
        val keyspace = "rtdmp"
        val tableName = "audience_info"
        val columns = SomeColumns("devid", "audience_data", "update_time")
        val cassandraConnector = CassandraConnector(sc.getConf)
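SomeColumns plus saveToCassandra is the standard spark-cassandra-connector write path used later in this hunk (`df.saveToCassandra(keyspace, tableName, columns)`). A minimal sketch reusing the `sc`, `keyspace`, and `tableName` defined above, with a toy tuple RDD (the device id and JSON payload are invented):

import com.datastax.spark.connector._

// Each (devid, audience_data, update_time) tuple maps positionally onto the
// columns named in SomeColumns and becomes one row in rtdmp.audience_info.
val rows = sc.parallelize(Seq(
  ("00b4e3d1c0ffee0123456789abcdef01", """{"42":"2021-05-28 00"}""", "2021-05-28 00")))
rows.saveToCassandra(keyspace, tableName, SomeColumns("devid", "audience_data", "update_time"))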
        val set = ServerUtil.request(update_time_start, update_time_end, audience_date_utime_start, audience_date_utime_end, 0, 0, 4)
          .retain((_, v) => v._2 == 1)
          .keySet
        val cassandraConnector = CassandraConnector(sc.getConf)
        println("audienceIds -->> " + set.mkString(","))

        val df = mergeRDD.groupByKey().map(r => {
          val devId = r._1
          val jsonObject = new JSONObject()
          // Build the audienceId -> update_date JSONObject
          r._2.foreach(t => {
            jsonObject.put(t._1.toString, t._2)
          })
          Row(devId, jsonObject.toJSONString, time)
        }).repartition(partition.toInt)
          .mapPartitions(it => cassandraConnector.withSessionDo(session => {
            Logic.getResultFeature(session, it)
          }))
          .mapPartitions(new CustomIteratorAudienceInfo(_, time, expire_time, set))

        object ReadConfigurationOne {
          implicit val readConf = ReadConf(Option(10000), 5, 2048, ConsistencyLevel.LOCAL_ONE, true)
        }
        // .mapPartitions(Logic.parseAudienceInfo(_, expire_time)
        )

        val selectDF = spark.read.orc(input)

        df.persist(StorageLevel.MEMORY_AND_DISK_SER)

        // Only update the previous hour's data
        val update_time_start = DateUtil.format(time + ":00:00", "yyyy-MM-dd HH:mm:ss")
        val update_time_end = DateUtil.format(time + ":59:59", "yyyy-MM-dd HH:mm:ss")
        val audience_date_utime_start = DateUtil.parse(time + ":00:00", "yyyy-MM-dd HH:mm:ss").getTime / 1000 - 28800
        val audience_date_utime_end = DateUtil.parse(time + ":59:59", "yyyy-MM-dd HH:mm:ss").getTime / 1000 - 28800

        df.saveToCassandra(keyspace, tableName, columns)

        val update_ids = ServerUtil.request(update_time_start, update_time_end, audience_date_utime_start, audience_date_utime_end, 0, 0, 2)
          .asScala
          .keySet

        val audienceSum = df.map(r => {
          val array = new ArrayBuffer[(Int, Int)]()
          MobvistaConstant.String2JSONObject(r.audience_data).keySet().foreach(k => {
            if (map.keySet().contains(Integer.parseInt(k))) {
              array.add((Integer.parseInt(k), 1))
            }
          })
          array.iterator
        }).flatMap(l => l)
          .countByKey()

        val audience_output = output + "/audience"
        FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(audience_output), true)

        val df = selectDF.mapPartitions(it => cassandraConnector.withSessionDo(session => {
          Logic.getResultFeature(session, it)
        })).toDF
          .select(col("devid"), col("audience_data").alias("audience_ids"), col("audience_data"))
          .rdd
          .mapPartitions(v => new CustomMapPartition(v, update_time = time, expire_time, update_ids))

        sc.parallelize(audienceSum.toList).coalesce(1).saveAsTextFile(audience_output)

        df.persist(StorageLevel.MEMORY_AND_DISK_SER)

        val data_output = output + "/data"
        FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(data_output), true)
        FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)

        import spark.implicits._
        df.mapPartitions(Logic.writeResult(cassandraConnector, _))
          .repartition(coalesce.toInt)
        df.repartition(coalesce.toInt)
          .toDF
          .write
          .mode(SaveMode.Overwrite)
          .option("orc.compress", "zlib")
          .orc(data_output)
        // .mapPartitions(Logic.writeResult(cassandraConnector, _))
        /*
        .mapPartitions(Logic.parseResult(data_output, _))
        .repartition(coalesce.toInt)
        .saveAsNewAPIHadoopFile(data_output, classOf[Text], classOf[Text], classOf[RDDMultipleOutputFormat[_, _]])
        .orc(output)

        val jsonArray = new JSONArray()
        audienceSum.foreach(m => {
          val jsonObject = new JSONObject()
          jsonObject.put("id", m._1)
          jsonObject.put("audience_data_status", 2)
          jsonObject.put("audience_count", m._2)
          jsonArray.add(jsonObject)
        })

        df.saveToCassandra(keyspace, tableName, columns)
      } else {
        FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)

        val jsonObject = ServerUtil.update(jsonArray)
        if (jsonObject.getInteger("code") == 200) {
          println("Audience Update OK!")

        Seq.empty[NewAudienceInfo].toDF
          .write
          .mode(SaveMode.Overwrite)
          .option("orc.compress", "zlib")
          .orc(output)
        }
        */
      }
    } finally {
      if (sc != null) {
...
...