Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
mobvista-dmp
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
王金锋
mobvista-dmp
Commits
356a6eab
Commit
356a6eab
authored
Jul 27, 2021
by
WangJinfeng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix rtdmp bug, appid_package bug
parent
88bfe623
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
29 additions
and
11 deletions
+29
-11
user_info_ck.sh
azkaban/datatory/user_info/user_info_ck.sh
+0
-2
rtdmp.sh
azkaban/rtdmp/rtdmp.sh
+5
-3
AppidPackageDictMR.java
...a/mobvista/dmp/datasource/setting/AppidPackageDictMR.java
+2
-1
RTDmpMain.scala
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMain.scala
+22
-5
No files found.
azkaban/datatory/user_info/user_info_ck.sh
View file @
356a6eab
...
...
@@ -15,8 +15,6 @@ INTPUT="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/dm_user_info/${dat
check_await
${
INTPUT
}
/_SUCCESS
checkRTDmp
${
today
}
sleep
60
host
=
"ip-172-31-20-35.ec2.internal"
...
...
azkaban/rtdmp/rtdmp.sh
View file @
356a6eab
...
...
@@ -4,12 +4,14 @@ source ../dmp_env.sh
today
=
${
ScheduleTime
}
date_time
=
$(
date
+
"%Y%m%d%H"
-d
"-
2
hour
$today
"
)
date_time
=
$(
date
+
"%Y%m%d%H"
-d
"-
1
hour
$today
"
)
date_path
=
$(
date
+%Y/%m/%d/%H
-d
"-1 hour
$today
"
)
INPUT
=
"s3://mob-emr-test/dataplatform/rtdmp_pre/
${
date_path
}
"
old_date_time
=
$(
date
+
"%Y%m%d%H"
-d
"-2 hour
$today
"
)
old_date_path
=
$(
date
+%Y/%m/%d/%H
-d
"-2 hour
$today
"
)
OLD_MERGE_INPUT
=
"s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/audience_merge/
${
old_date_path
}
"
...
...
@@ -27,13 +29,13 @@ spark-submit --class mobvista.dmp.datasource.rtdmp.RTDmpMain \
--master
yarn
--deploy-mode
cluster
\
--executor-memory
18g
--driver-memory
6g
--executor-cores
5
--num-executors
40
\
.././DMP.jar
\
-datetime
${
date_time
}
-input
${
INPUT
}
-output
${
OUTPUT
}
-coalesce
200
-datetime
${
date_time
}
-
old_datetime
${
old_date_time
}
-
input
${
INPUT
}
-output
${
OUTPUT
}
-coalesce
200
if
[[
$?
-ne
0
]]
;
then
exit
255
fi
mount_partition
"audience_merge"
"dt='
${
curr
_time
}
'"
"
$OUTPUT
"
mount_partition
"audience_merge"
"dt='
${
date
_time
}
'"
"
$OUTPUT
"
expire_time
=
$(
date
+
"%Y%m%d%H"
-d
"-24 hour
$today
"
)
...
...
src/main/java/mobvista/dmp/datasource/setting/AppidPackageDictMR.java
View file @
356a6eab
...
...
@@ -70,7 +70,8 @@ public class AppidPackageDictMR extends CommonMapReduce {
CommonMapReduce
.
setMetrics
(
context
,
"DMP"
,
"column_num_error"
,
1
);
return
;
}
if
(
array
.
length
==
3
&&
!
"null"
.
equals
(
array
[
1
]))
{
//原 appid, package_name, platform
if
(
array
.
length
==
3
&&
!
"null"
.
equals
(
array
[
1
])
&&
appidPtn
.
matcher
(
array
[
0
]).
matches
()
&&
platformPtn
.
matcher
(
array
[
2
]).
matches
())
{
//原 appid, package_name, platform
outKey
.
set
(
array
[
0
]);
outValue
.
set
(
joiner
.
join
(
"A"
,
array
[
1
],
array
[
2
]));
context
.
write
(
outKey
,
outValue
);
...
...
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMain.scala
View file @
356a6eab
...
...
@@ -3,6 +3,7 @@ package mobvista.dmp.datasource.rtdmp
import
com.alibaba.fastjson.JSONObject
import
mobvista.dmp.common.
{
CommonSparkJob
,
MobvistaConstant
}
import
mobvista.dmp.datasource.rtdmp.Constant.AudienceMerge
import
mobvista.dmp.util.DateUtil
import
org.apache.commons.cli.
{
BasicParser
,
Options
}
import
org.apache.spark.sql.
{
SaveMode
,
SparkSession
}
import
org.codehaus.jackson.map.ObjectMapper
...
...
@@ -24,6 +25,7 @@ class RTDmpMain extends CommonSparkJob with Serializable {
def
commandOptions
()
:
Options
=
{
val
options
=
new
Options
()
options
.
addOption
(
"datetime"
,
true
,
"datetime"
)
options
.
addOption
(
"old_datetime"
,
true
,
"old_datetime"
)
options
.
addOption
(
"input"
,
true
,
"input"
)
options
.
addOption
(
"output"
,
true
,
"output"
)
options
.
addOption
(
"coalesce"
,
true
,
"coalesce"
)
...
...
@@ -36,12 +38,13 @@ class RTDmpMain extends CommonSparkJob with Serializable {
val
options
=
commandOptions
()
val
commandLine
=
parser
.
parse
(
options
,
args
)
val
datetime
=
commandLine
.
getOptionValue
(
"datetime"
)
val
old_datetime
=
commandLine
.
getOptionValue
(
"old_datetime"
)
val
input
=
commandLine
.
getOptionValue
(
"input"
)
val
output
=
commandLine
.
getOptionValue
(
"output"
)
val
coalesce
=
commandLine
.
getOptionValue
(
"coalesce"
)
val
spark
:
SparkSession
=
SparkSession
.
builder
()
.
appName
(
s
"RTDmpMain"
)
.
appName
(
s
"RTDmpMain
.${datetime}
"
)
.
config
(
"spark.rdd.compress"
,
"true"
)
.
config
(
"spark.io.compression.codec"
,
"snappy"
)
.
config
(
"spark.sql.orc.filterPushdown"
,
"true"
)
...
...
@@ -52,7 +55,21 @@ class RTDmpMain extends CommonSparkJob with Serializable {
val
sc
=
spark
.
sparkContext
try
{
val
sdf
=
new
SimpleDateFormat
(
"yyyyMMddHH"
)
var
sdf
=
new
SimpleDateFormat
(
"yyyyMMddHHmmss"
)
// 默认计算上个小时的数据
val
update_time_start
=
DateUtil
.
format
(
sdf
.
parse
(
datetime
+
"0000"
),
"yyyy-MM-dd HH:mm:ss"
)
val
update_time_end
=
DateUtil
.
format
(
sdf
.
parse
(
datetime
+
"5959"
),
"yyyy-MM-dd HH:mm:ss"
)
val
audience_date_utime_start
=
sdf
.
parse
(
datetime
+
"0000"
).
getTime
/
1000
-
28800
val
audience_date_utime_end
=
sdf
.
parse
(
datetime
+
"5959"
).
getTime
/
1000
-
25200
val
updateAudienceIds
=
ServerUtil
.
request
(
update_time_start
,
update_time_end
,
audience_date_utime_start
,
audience_date_utime_end
,
0
,
1
,
2
)
.
asScala
.
keys
.
toSet
println
(
s
"updateAudienceIds -->> ${updateAudienceIds.mkString("
,
")}"
)
sdf
=
new
SimpleDateFormat
(
"yyyyMMddHH"
)
val
calendar
=
Calendar
.
getInstance
()
val
date
=
sdf
.
parse
(
datetime
)
calendar
.
setTime
(
date
)
...
...
@@ -72,7 +89,7 @@ class RTDmpMain extends CommonSparkJob with Serializable {
val
sql
=
s
"""
|SELECT * FROM dwh.audience_merge WHERE dt = '$datetime'
|SELECT * FROM dwh.audience_merge WHERE dt = '$
old_
datetime'
|"""
.
stripMargin
val
merge_rdd
=
spark
.
sql
(
sql
).
rdd
.
map
(
row
=>
{
...
...
@@ -95,7 +112,7 @@ class RTDmpMain extends CommonSparkJob with Serializable {
val
new_audience
=
MobvistaConstant
.
String2JSONObject
(
opt1
.
get
.
_1
).
asInstanceOf
[
java.util.Map
[
String
,
String
]]
val
old_audience
=
opt2
.
get
.
_1
val
retain_old_audience
=
MobvistaConstant
.
String2JSONObject
(
old_audience
).
asInstanceOf
[
java.util.Map
[
String
,
String
]].
asScala
.
retain
((
k
,
v
)
=>
!
new_audience
.
keySet
().
contains
(
k
)
&&
v
.
compareTo
(
expire_time
)
>
0
)
.
retain
((
k
,
v
)
=>
!
updateAudienceIds
.
contains
(
k
.
toInt
)
&&
!
new_audience
.
keySet
().
contains
(
k
)
&&
v
.
compareTo
(
expire_time
)
>
0
)
new_audience
.
putAll
(
retain_old_audience
.
asJava
)
AudienceMerge
(
devid
,
new
ObjectMapper
().
writeValueAsString
(
new_audience
),
datetime
,
opt1
.
get
.
_2
)
}
else
if
(
opt1
.
nonEmpty
&&
opt2
.
isEmpty
)
{
...
...
@@ -103,7 +120,7 @@ class RTDmpMain extends CommonSparkJob with Serializable {
}
else
{
val
old_audience
=
opt2
.
get
.
_1
val
retain_old_audience
=
MobvistaConstant
.
String2JSONObject
(
old_audience
).
asInstanceOf
[
java.util.Map
[
String
,
String
]].
asScala
.
retain
((
_
,
v
)
=>
v
.
compareTo
(
expire_time
)
>
0
)
.
retain
((
k
,
v
)
=>
!
updateAudienceIds
.
contains
(
k
.
toInt
)
&&
v
.
compareTo
(
expire_time
)
>
0
)
AudienceMerge
(
devid
,
new
ObjectMapper
().
writeValueAsString
(
retain_old_audience
.
asJava
),
opt2
.
get
.
_2
,
opt2
.
get
.
_3
)
}
})
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment