mobvista-dmp commit 14b970fb, authored Oct 15, 2021 by WangJinfeng
fix dmp bug
parent 13331718
Showing 7 changed files with 52 additions and 50 deletions:
azkaban/dm/dm_active_tag_month.sh  (+1, -1)
azkaban/dm/dm_active_tag_week.sh  (+1, -1)
azkaban/event_tag/Ga_purchase_event.sh  (+1, -2)
azkaban/realtime/dm_realtime_service.sh  (+6, -6)
src/main/scala/mobvista/dmp/datasource/age_gender/Logic.scala  (+11, -11)
src/main/scala/mobvista/dmp/datasource/event_tag/Ga_purchase_event.scala  (+3, -3)
src/main/scala/mobvista/dmp/datasource/retargeting/DeviceInfoJob.scala  (+29, -26)
azkaban/dm/dm_active_tag_month.sh

@@ -64,7 +64,7 @@ spark-submit --class mobvista.dmp.datasource.dm.ActiveTag \
   --conf spark.sql.files.maxPartitionBytes=268435456 \
   --conf spark.sql.adaptive.enabled=true \
   --conf spark.sql.adaptive.advisoryPartitionSizeInBytes=268435456 \
-  --master yarn --deploy-mode cluster --executor-memory 18g --driver-memory 4g --executor-cores 5 --num-executors 60 \
+  --master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 4g --executor-cores 5 --num-executors 60 \
   ../${JAR} \
   -date ${date} -output ${OUTPUT_PATH} -coalesce 1000 -days 29
azkaban/dm/dm_active_tag_week.sh

@@ -66,7 +66,7 @@ spark-submit --class mobvista.dmp.datasource.dm.ActiveTag \
   --conf spark.sql.files.maxPartitionBytes=268435456 \
   --conf spark.sql.adaptive.enabled=true \
   --conf spark.sql.adaptive.advisoryPartitionSizeInBytes=268435456 \
-  --master yarn --deploy-mode cluster --executor-memory 18g --driver-memory 4g --executor-cores 5 --num-executors 40 \
+  --master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 4g --executor-cores 5 --num-executors 60 \
   ../${JAR} \
   -date ${date} -output ${OUTPUT_PATH} -coalesce 1000 -days 6
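Both active-tag launchers change only the resource line of their spark-submit call: executor memory drops from 18g to 12g, and the weekly job additionally raises --num-executors from 40 to 60 (the monthly job stays at 60). For the weekly job the total executor memory therefore stays at roughly 720g (40 x 18g versus 60 x 12g), while the monthly job's total drops by a third; driver memory, executor cores, and the job arguments are unchanged.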
azkaban/event_tag/Ga_purchase_event.sh

-# !/bin/sh
+#!/bin/sh
   source ../dmp_env.sh
   ## date=$1
   ## deal_time="";

@@ -26,7 +26,6 @@ deal_time=$(date -d "$ScheduleTime 1 days ago" +"%Y%m%d")
   check_await "${GA_DAILY_PATH}/$date_path/_SUCCESS"
-  echo "sdsdsds"
   echo "$deal_time"
   spark-submit --class mobvista.dmp.datasource.event_tag.Ga_purchase_event \
     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
azkaban/realtime/dm_realtime_service.sh

@@ -26,21 +26,21 @@ sleep 30
   output_path="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/dm_user_info/${date_path}"
   unmount_output_path="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/dm_user_info/${unmount_date_path}"
-# export SPARK_HOME="/data/hadoop-home/engineplus-k8s-spark-3.0.0-hadoop3.2"
+export SPARK_HOME="/data/hadoop-home/engineplus-k8s-spark-3.0.0-hadoop3.2"
-# export SPARK_CONF_DIR="/data/hadoop-config/command-home/engineplus-k8s-spark-3.0.0-online/conf"
+export SPARK_CONF_DIR="/data/hadoop-config/command-home/engineplus-k8s-spark-3.0.0-online/conf"
   spark-submit --class mobvista.dmp.datasource.retargeting.DeviceInfoJob \
     --name "DeviceInfoJob.wangjf.${date}" \
-    --conf spark.sql.shuffle.partitions=6000 \
-    --conf spark.default.parallelism=6000 \
     --conf spark.sql.broadcastTimeout=1200 \
+    --conf spark.sql.shuffle.partitions=10000 \
+    --conf spark.default.parallelism=10000 \
     --conf spark.kryoserializer.buffer.max=512m \
     --conf spark.kryoserializer.buffer=64m \
     --conf spark.sql.files.maxPartitionBytes=536870912 \
     --conf spark.sql.autoBroadcastJoinThreshold=-1 \
     --conf spark.sql.adaptive.enabled=true \
     --conf spark.sql.adaptive.advisoryPartitionSizeInBytes=536870912 \
-    --master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 180 \
+    --master yarn --deploy-mode cluster --executor-memory 12g --driver-memory 10g --executor-cores 4 --num-executors 100 \
     ../${JAR} \
     -date ${date} -output ${output_path} -coalesce 3000
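In the realtime launcher, the SPARK_HOME and SPARK_CONF_DIR exports that were previously commented out are now active, pinning DeviceInfoJob to the engineplus-k8s-spark-3.0.0 build and its online configuration directory. The submit options raise spark.sql.shuffle.partitions and spark.default.parallelism from 6000 to 10000 and reduce --num-executors from 180 to 100, leaving per-executor memory and cores as they were.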
src/main/scala/mobvista/dmp/datasource/age_gender/Logic.scala

   package mobvista.dmp.datasource.age_gender

-  import java.math.BigDecimal
-  import java.text.SimpleDateFormat
-  import java.util
-  import java.util.Random
-  import java.util.regex.Pattern
   import com.alibaba.fastjson.{JSON, JSONObject}
   import com.google.common.collect.Sets
   import mobvista.dmp.datasource.age.mapreduce.Util

@@ -16,13 +10,18 @@ import org.apache.spark.sql.Row
   import org.apache.spark.sql.types.{StringType, StructField, StructType}
   import org.codehaus.jackson.map.ObjectMapper
+  import java.math.BigDecimal
+  import java.text.SimpleDateFormat
+  import java.util
+  import java.util.Random
+  import java.util.regex.Pattern
   import scala.collection.JavaConverters._

   /**
     * @package: mobvista.dmp.datasource.age
     * @author: wangjf
     * @create: 2018-09-13 16:01
     *
     */
   object Logic {
     private val wellSplit = Pattern.compile("#")

@@ -31,6 +30,7 @@ object Logic {
     private val lineSplit: Pattern = Pattern.compile("-")
     private val `match`: Pattern = Pattern.compile("^0*-0*-0*-0*-0*$")
     private val regex: Pattern = Pattern.compile("""^\d+$""")
+    private val ageRegex: Pattern = Pattern.compile("""\d{4}$""")
     private val matchingAgeSet: util.HashSet[String] = Sets.newHashSet("", "0", "1970", "GB", "null", "-")
     private val matchingGenderSet: util.HashSet[String] = Sets.newHashSet("f", "m")

@@ -516,7 +516,7 @@ object Logic {
     }

     def check_birthday(now: Int, birthday: String): Boolean = {
-      StringUtils.isNotBlank(birthday) && !matchingAgeSet.contains(birthday) && regex.matcher(birthday).matches() && (now - Integer.parseInt(birthday)) > 0 &&
+      StringUtils.isNotBlank(birthday) && !matchingAgeSet.contains(birthday) && ageRegex.matcher(birthday).matches() && (now - Integer.parseInt(birthday)) > 0 &&
         (now - Integer.parseInt(birthday)) < 100
     }
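The substantive change in Logic.scala is the new ageRegex pattern and its use in check_birthday: the old predicate accepted any all-digit string, while the new one, combined with Matcher.matches() (which must consume the whole input), only accepts values that are exactly four digits, i.e. something that can plausibly be a birth year. A minimal sketch of the two predicates, with hypothetical sample values (the object name and samples are illustrative, not from the DMP code):

    import java.util.regex.Pattern

    object BirthdayCheckSketch {
      // Patterns as declared in Logic.scala.
      private val regex: Pattern = Pattern.compile("""^\d+$""")     // old predicate: any run of digits
      private val ageRegex: Pattern = Pattern.compile("""\d{4}$""") // new predicate

      // Reduced forms of check_birthday; the blank/placeholder checks and the
      // (now - year) range checks of the real method are omitted here.
      def oldCheck(birthday: String): Boolean = regex.matcher(birthday).matches()
      def newCheck(birthday: String): Boolean = ageRegex.matcher(birthday).matches()

      def main(args: Array[String]): Unit = {
        Seq("1987", "19870613", "87", "1987x").foreach { s =>
          println(s"$s -> old=${oldCheck(s)}, new=${newCheck(s)}")
        }
        // 1987     -> old=true,  new=true
        // 19870613 -> old=true,  new=false
        // 87       -> old=true,  new=false
        // 1987x    -> old=false, new=false
      }
    }

With the old regex a full yyyyMMdd value such as 19870613 passed the format check and then fed a nonsensical year into the now - Integer.parseInt(birthday) age calculation; requiring exactly four digits rules that out, which appears to be the bug the commit message refers to.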
src/main/scala/mobvista/dmp/datasource/event_tag/Ga_purchase_event.scala

@@ -14,8 +14,8 @@ object Ga_purchase_event {
     val spark = SparkSession.builder()
       .enableHiveSupport()
       .getOrCreate()
-    spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
-    spark.conf.set("spark.kryoserializer.buffer.max", "300m")
+    // spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
+    // spark.conf.set("spark.kryoserializer.buffer.max", "300m")
     //yyyyMMdd
     val loadTime = spark.conf.get("spark.app.loadTime")
     var year = loadTime.substring(0, 4)

@@ -25,7 +25,7 @@ object Ga_purchase_event {
     val table_name = spark.conf.get("spark.app.table")
     val db_name = spark.conf.get("spark.app.db_name")
     val outputPath = "s3://mob-emr-test/dataplatform/DataWareHouse/data/" + db_name + "/" + table_name
-    spark.conf.set("spark.kryoserializer.buffer.max", "300m")
+    // spark.conf.set("spark.kryoserializer.buffer.max", "300m")
     //***parquet**
     spark.sparkContext.hadoopConfiguration.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
     spark.sparkContext.hadoopConfiguration.set("yarn.nodemanager.pmem-check-enabled", "false")
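Here, as in the companion change to azkaban/event_tag/Ga_purchase_event.sh, the Kryo settings are left to the spark-submit --conf flags and the in-code spark.conf.set calls are commented out. That matches how Spark treats these properties: spark.serializer and the Kryo buffer sizes are read when the SparkContext is built, so setting them on an already-created session generally has no effect. A minimal sketch of supplying them at build time instead, assuming a local test session (the real job gets them from the launcher):

    import org.apache.spark.sql.SparkSession

    object KryoConfigSketch {
      def main(args: Array[String]): Unit = {
        // Static configs must be in place before the underlying SparkContext exists.
        val spark = SparkSession.builder()
          .master("local[*]")
          .appName("kryo-config-sketch")
          .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          .config("spark.kryoserializer.buffer.max", "300m")
          .getOrCreate()

        println(spark.sparkContext.getConf.get("spark.serializer"))
        spark.stop()
      }
    }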
src/main/scala/mobvista/dmp/datasource/retargeting/DeviceInfoJob.scala

@@ -9,6 +9,7 @@ import mobvista.prd.datasource.util.GsonUtil
   import org.apache.commons.cli.{BasicParser, Options}
   import org.apache.commons.lang3.StringUtils
   import org.apache.hadoop.fs.{FileSystem, Path}
+  import org.apache.spark.broadcast.Broadcast
   import org.apache.spark.sql.{SaveMode, SparkSession}
   import java.net.URI

@@ -38,8 +39,8 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
   val sdf1 = new SimpleDateFormat("yyyy-MM-dd")
   val sdf2 = new SimpleDateFormat("yyyyMMdd")
-  var bMap: scala.collection.Map[String, String] = new mutable.HashMap[String, String]()
-  var packageMap: scala.collection.Map[String, Int] = new mutable.HashMap[String, Int]()
+  var bMap: Broadcast[scala.collection.Map[String, String]] = null
+  var packageMap: Broadcast[scala.collection.Map[String, Int]] = null

   override protected def run(args: Array[String]): Int = {
     val parser = new BasicParser()

@@ -56,6 +57,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
       .config("spark.sql.orc.filterPushdown", "true")
      .config("spark.io.compression.codec", "lz4")
      .config("spark.io.compression.lz4.blockSize", "64k")
+     .config("spark.sql.autoBroadcastJoinThreshold", "314572800")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()

@@ -69,15 +71,17 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
     val sc = spark.sparkContext
     val code_sql = Constant.old2new_sql
-    bMap = spark.sql(code_sql).rdd.cache()
-      .map(r => {
-        (r.getAs("tag_code").toString, r.getAs("new_second_id").toString)
-      }).collectAsMap()
+    bMap = sc.broadcast(spark.sql(code_sql).rdd.map(r => {
+      (r.getAs("tag_code").toString, r.getAs("new_second_id").toString)
+    }).collectAsMap())
+    println("bMap.size ===>>> " + bMap.value.size)
-    val map = spark.sql(Constant.second2first_sql).rdd.cache()
-      .map(r => {
-        (r.getAs("new_second_id").toString, r.getAs("new_first_id").toString)
-      }).collectAsMap()
+    val map = sc.broadcast(spark.sql(Constant.second2first_sql).rdd.map(r => {
+      (r.getAs("new_second_id").toString, r.getAs("new_first_id").toString)
+    }).collectAsMap())
+    println("map.size ===>>> " + map.value.size)
     var package_sql =
       """

@@ -89,10 +93,9 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
       s"""
          |SELECT id, package_name FROM dwh.package_mapping WHERE dt = '${package_dt}'
       """.stripMargin
-    packageMap = spark.sql(package_sql).rdd.cache()
-      .map(r => {
-        (r.getAs("package_name").toString.toLowerCase, Integer.parseInt(r.getAs("id").toString))
-      }).collectAsMap()
+    packageMap = spark.sparkContext.broadcast(spark.sql(package_sql).rdd.map(r => {
+      (r.getAs("package_name").toString.toLowerCase, Integer.parseInt(r.getAs("id").toString))
+    }).collectAsMap())
     /*
       packageMap = sc.broadcast(Constant.jdbcConnection(spark, "mob_adn", "dmp_app_map").rdd.map(r => {

@@ -143,7 +146,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
         new JSONObject()
       }
       freObject.keySet().foreach(key => {
-        interest.add(map(key))
+        interest.add(map.value(key))
         interest.add(key)
       })
       /*

@@ -161,7 +164,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
       */
       val interestArr = r.getAs("interest").asInstanceOf[mutable.WrappedArray[String]]
       interestArr.foreach(i => {
-        interest.add(map(i))
+        interest.add(map.value(i))
         interest.add(i)
       })

@@ -177,7 +180,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
           val count = j.get("count").getAsInt
           cntJson.put("count", count)
           tag_week_jsonObject.put(tag_id, cntJson)
-          interest.add(map(tag_id))
+          interest.add(map.value(tag_id))
           interest.add(tag_id)
         }
       }

@@ -193,7 +196,7 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
           val count = j.get("count").getAsInt
           cntJson.put("count", count)
           tag_month_jsonObject.put(tag_id, cntJson)
-          interest.add(map(tag_id))
+          interest.add(map.value(tag_id))
           interest.add(tag_id)
         }
       }

@@ -224,10 +227,10 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
       val ins = inters.toUpperCase.split(",")
       if (ins.length >= 3) {
         val key = ins(0) + "-" + ins(1) + "-" + ins(2)
-        val vals = if (bMap.keySet.contains(key)) {
-          bMap(key)
+        val vals = if (bMap.value.keySet.contains(key)) {
+          bMap.value(key)
         } else {
-          bMap.getOrElse(key + "OTHER", "")
+          bMap.value.getOrElse(key + "OTHER", "")
         }
         if (StringUtils.isNotBlank(vals)) {
           set.add(vals)

@@ -238,10 +241,10 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
   }

   def getId(tag_code: String): String = {
-    val id = if (bMap.keySet.contains(tag_code.toUpperCase)) {
-      bMap(tag_code.toUpperCase)
+    val id = if (bMap.value.keySet.contains(tag_code.toUpperCase)) {
+      bMap.value(tag_code.toUpperCase)
     } else {
-      bMap.getOrElse(tag_code.toUpperCase + "OTHER", "")
+      bMap.value.getOrElse(tag_code.toUpperCase + "OTHER", "")
     }
     id
   }

@@ -252,9 +255,9 @@ class DeviceInfoJob extends CommonSparkJob with Serializable {
     if (StringUtils.isNotBlank(install)) {
       install.split(",").foreach(pkgs => {
         val pkd = pkgs.split("\\|")
-        if (pkd.nonEmpty && StringUtils.isNotBlank(pkd(0)) && packageMap.contains(pkd(0).toLowerCase)) {
-          set.add(packageMap(pkd(0).toLowerCase))
+        if (pkd.nonEmpty && StringUtils.isNotBlank(pkd(0)) && packageMap.value.contains(pkd(0).toLowerCase)) {
+          set.add(packageMap.value(pkd(0).toLowerCase))
         }
       })
     }
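Most of this file's changes convert the driver-side lookup maps (bMap, packageMap, and the local map) into Spark broadcast variables: each map is still built with collectAsMap() on the driver, but it is then wrapped in sc.broadcast(...) and read inside executor closures through .value. Without the broadcast, referencing a plain map field from a closure forces Spark to serialize the map into every task; with a Broadcast it is shipped once per executor and cached. A small self-contained sketch of the same pattern, using hypothetical tables and names (BroadcastLookupSketch, tagCodes, devices) rather than the DMP SQL:

    import org.apache.spark.sql.SparkSession

    object BroadcastLookupSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[*]")
          .appName("broadcast-lookup-sketch")
          .getOrCreate()
        val sc = spark.sparkContext
        import spark.implicits._

        // Small lookup table collected to the driver as a Map (stands in for old2new_sql).
        val tagCodes = Seq(("A-B-C", "1001"), ("A-B-D", "1002")).toDF("tag_code", "new_second_id")
        val bMap = sc.broadcast(
          tagCodes.rdd.map(r => (r.getAs[String]("tag_code"), r.getAs[String]("new_second_id"))).collectAsMap()
        )

        // Larger dataset processed on executors; the lookup is read via .value inside the closure.
        // (The real job also falls back to a key + "OTHER" entry; omitted here.)
        val devices = Seq("A-B-C", "A-B-X").toDF("key")
        val resolved = devices.rdd.map { r =>
          val key = r.getAs[String]("key")
          (key, bMap.value.getOrElse(key, ""))
        }
        resolved.collect().foreach(println) // (A-B-C,1001), (A-B-X,)

        spark.stop()
      }
    }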