王金锋 / mobvista-dmp / Commits / cc706c53

Commit cc706c53, authored Jul 20, 2021 by WangJinfeng
parent afb611c4

    update rtdmp logic, add device_type

Showing 13 changed files with 120 additions and 65 deletions.
azkaban/backflow/mapping/mapping.sh: +1, -1
azkaban/rtdmp/rtdmp_merge.sh: +15, -15
azkaban/rtdmp/rtdmp_request.sh: +1, -1
src/main/java/mobvista/dmp/datasource/rtdmp/ServerMain.java: +4, -0
src/main/scala/mobvista/dmp/datasource/rtdmp/Constant.scala: +3, -3
src/main/scala/mobvista/dmp/datasource/rtdmp/CustomIteratorAudienceInfo.scala: +8, -8
src/main/scala/mobvista/dmp/datasource/rtdmp/CustomIteratorAudienceInfoV2.scala: +1, -1
src/main/scala/mobvista/dmp/datasource/rtdmp/CustomMapPartition.scala: +2, -1
src/main/scala/mobvista/dmp/datasource/rtdmp/Logic.scala: +14, -4
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMainPre.scala: +59, -25
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMainSpe.scala: +1, -1
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMerge.scala: +9, -5
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMergeCK.scala: +2, -0
azkaban/backflow/mapping/mapping.sh (view file @ cc706c53)

@@ -64,7 +64,7 @@ spark-submit --class mobvista.dmp.datasource.backflow.BackFlow \
 --conf spark.kryoserializer.buffer.max=512m \
 --conf spark.kryoserializer.buffer=64m \
 --master yarn --deploy-mode cluster \
---executor-memory 4g --driver-memory 4g --executor-cores 2 --num-executors 4 \
+--executor-memory 4g --driver-memory 4g --executor-cores 3 --num-executors 4 \
 ../.././DMP.jar \
 -keyspace ${keyspace} -table ${table} -region ${region} -output ${output} -system ${system} \
 -writetime_start ${writetime_start} -writetime_end ${writetime_end} -value_column ${value_column}
azkaban/rtdmp/rtdmp_merge.sh (view file @ cc706c53)

@@ -4,33 +4,33 @@ source ../dmp_env.sh
 today=${ScheduleTime:-$1}
-old_time=$(date +"%Y%m%d%H" -d "-7 hour $today")
+old_time=$(date +"%Y%m%d%H" -d "-2 hour $today")
 curr_time=$(date +"%Y%m%d%H" -d "-1 hour $today")
-old_date_path=$(date +%Y/%m/%d/%H -d "-7 hour $today")
+old_date_path=$(date +%Y/%m/%d/%H -d "-2 hour $today")
+echo ${old_date_path}
 date_path=$(date +%Y/%m/%d/%H -d "-1 hour $today")
 BASE_PATH="s3://mob-emr-test/dataplatform/rtdmp_deal"
 HOUR_1_DATE=$(date +%Y/%m/%d/%H -d "-1 hour $today")
-HOUR_2_DATE=$(date +%Y/%m/%d/%H -d "-2 hour $today")
+# HOUR_2_DATE=$(date +%Y/%m/%d/%H -d "-2 hour $today")
-HOUR_3_DATE=$(date +%Y/%m/%d/%H -d "-3 hour $today")
+# HOUR_3_DATE=$(date +%Y/%m/%d/%H -d "-3 hour $today")
-HOUR_4_DATE=$(date +%Y/%m/%d/%H -d "-4 hour $today")
+# HOUR_4_DATE=$(date +%Y/%m/%d/%H -d "-4 hour $today")
-HOUR_5_DATE=$(date +%Y/%m/%d/%H -d "-5 hour $today")
+# HOUR_5_DATE=$(date +%Y/%m/%d/%H -d "-5 hour $today")
-HOUR_6_DATE=$(date +%Y/%m/%d/%H -d "-6 hour $today")
+# HOUR_6_DATE=$(date +%Y/%m/%d/%H -d "-6 hour $today")
-INPUT="${BASE_PATH}/${HOUR_1_DATE},${BASE_PATH}/${HOUR_2_DATE},${BASE_PATH}/${HOUR_3_DATE},${BASE_PATH}/${HOUR_4_DATE},${BASE_PATH}/${HOUR_5_DATE},${BASE_PATH}/${HOUR_6_DATE}"
+# INPUT="${BASE_PATH}/${HOUR_1_DATE},${BASE_PATH}/${HOUR_2_DATE},${BASE_PATH}/${HOUR_3_DATE},${BASE_PATH}/${HOUR_4_DATE},${BASE_PATH}/${HOUR_5_DATE},${BASE_PATH}/${HOUR_6_DATE}"
+INPUT="${BASE_PATH}/${HOUR_1_DATE}"
 check_await ${BASE_PATH}/${HOUR_1_DATE}/_SUCCESS
-check_await ${BASE_PATH}/${HOUR_2_DATE}/_SUCCESS
+# check_await ${BASE_PATH}/${HOUR_2_DATE}/_SUCCESS
-check_await ${BASE_PATH}/${HOUR_3_DATE}/_SUCCESS
+# check_await ${BASE_PATH}/${HOUR_3_DATE}/_SUCCESS
-check_await ${BASE_PATH}/${HOUR_4_DATE}/_SUCCESS
+# check_await ${BASE_PATH}/${HOUR_4_DATE}/_SUCCESS
-check_await ${BASE_PATH}/${HOUR_5_DATE}/_SUCCESS
+# check_await ${BASE_PATH}/${HOUR_5_DATE}/_SUCCESS
-check_await ${BASE_PATH}/${HOUR_6_DATE}/_SUCCESS
+# check_await ${BASE_PATH}/${HOUR_6_DATE}/_SUCCESS
 MERGE_INPUT="s3://mob-emr-test/dataplatform/DataWareHouse/data/dwh/audience_merge/${old_date_path}"
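The window change above shrinks the merge lookback from 7 hours to 2 and feeds INPUT from only the latest hour. For readers following the date arithmetic, here is a minimal Scala sketch of the same path computation, using java.time in place of GNU date; the sample timestamp and the main harness are illustrative only:

import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

object PathWindowSketch {
  // Mirrors the script's `date +%Y/%m/%d/%H -d "-N hour $today"`.
  private val hourPath = DateTimeFormatter.ofPattern("yyyy/MM/dd/HH")

  def hoursAgoPath(today: LocalDateTime, n: Long): String =
    today.minusHours(n).format(hourPath)

  def main(args: Array[String]): Unit = {
    val today = LocalDateTime.of(2021, 7, 20, 10, 0) // illustrative ScheduleTime
    // After this commit, INPUT covers only the latest finished hour ...
    println(s"s3://mob-emr-test/dataplatform/rtdmp_deal/${hoursAgoPath(today, 1)}")
    // ... and MERGE_INPUT looks back 2 hours instead of 7.
    println(hoursAgoPath(today, 2))
  }
}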
azkaban/rtdmp/rtdmp_request.sh (view file @ cc706c53)

@@ -119,7 +119,7 @@ if [[ $? -ne 0 ]]; then
   exit 255
 fi
-expire_date_path=$(date +%Y/%m/%d -d "-15 day $today")
+expire_date_path=$(date +%Y/%m/%d -d "-90 day $today")
 EXPIRE_OUTPUT_PATH="s3://mob-emr-test/dataplatform/rtdmp_request/${expire_date_path}"
 if hadoop fs -ls "$EXPIRE_OUTPUT_PATH" > /dev/null 2>&1; then
   hadoop dfs -rm -r ${EXPIRE_OUTPUT_PATH}
src/main/java/mobvista/dmp/datasource/rtdmp/ServerMain.java (view file @ cc706c53)

@@ -79,6 +79,8 @@ public class ServerMain {
         try {
             httpPost.setConfig(requestConfig);
             httpPost.setEntity(new StringEntity(requestBody.toJSONString()));
+            client.execute(httpPost);
+            /*
             response = client.execute(httpPost);
             BufferedReader rd = new BufferedReader(
                     new InputStreamReader(response.getEntity().getContent()));
@@ -91,6 +93,8 @@ public class ServerMain {
             if (jsonObject.getInteger("code") == 200 && jsonObject.containsKey("data")) {
                 logger.info("Audience data fetch success!");
             }
+            */
+            logger.info("Audience data fetch success!");
         } catch (IOException e) {
             logger.info(e.getMessage());
         } finally {
src/main/scala/mobvista/dmp/datasource/rtdmp/Constant.scala (view file @ cc706c53)

@@ -14,11 +14,11 @@ object Constant {
   case class Device(device_id: String)

-  case class AudienceInfoPre(devid: String, audience_ids: String)
+  case class AudienceInfoPre(devid: String, audience_ids: String, device_type: String)

   case class AudienceInfo(devid: String, audience_data: String, old_audience_data: String)

-  case class NewAudienceInfo(devid: String, update_time: String, audience_data: String)
+  case class NewAudienceInfo(devid: String, update_time: String, audience_data: String, device_type: String)

   case class AudienceRegion(audience_info: String, region: mutable.WrappedArray[String])
@@ -26,7 +26,7 @@ object Constant {
   // case class AudienceMerge(devid: String, audience_id: mutable.WrappedArray[Int], update_time: String) extends Serializable
-  case class AudienceMerge(devid: String, audience_map: String, update_time: String) extends Serializable
+  case class AudienceMerge(devid: String, audience_map: String, update_time: String, device_type: String) extends Serializable

   val read_sql =
     """
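The new device_type field rides along on three of the pipeline's row types. A hedged sketch of how the widened case classes are populated (the class shapes are from the diff; the sample values and main harness are illustrative):

object DeviceTypeFieldSketch {
  // Shapes copied from Constant.scala after this commit.
  case class AudienceInfoPre(devid: String, audience_ids: String, device_type: String)
  case class NewAudienceInfo(devid: String, update_time: String, audience_data: String, device_type: String)

  def main(args: Array[String]): Unit = {
    // Upstream (RTDmpMainPre) recovers the type from the input directory name ...
    val pre = AudienceInfoPre("d41d8cd98f00b204e9800998ecf8427e", "101,102", "gaid_md5")
    // ... while iterator call sites that cannot recover it pass "" (see the diffs that follow).
    val merged = NewAudienceInfo(pre.devid, "2021072010", "{\"101\":\"2021072010\"}", pre.device_type)
    println(merged)
  }
}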
src/main/scala/mobvista/dmp/datasource/rtdmp/CustomIteratorAudienceInfo.scala (view file @ cc706c53)

@@ -4,13 +4,13 @@ import mobvista.dmp.common.MobvistaConstant
 import mobvista.dmp.datasource.rtdmp.Constant.{AudienceInfo, NewAudienceInfo}

 /**
   * @package: mobvista.dmp.datasource.rtdmp
   * @author: wangjf
   * @date: 2020/8/4
   * @time: 4:00 PM
   * @email: jinfeng.wang@mobvista.com
   * @phone: 152-1062-7698
   */
 class CustomIteratorAudienceInfo(audinceInfos: Iterator[AudienceInfo], update_time: String, expire_time: String, set: java.util.Set[Integer]) extends Iterator[NewAudienceInfo] {

   override def hasNext: Boolean = {
     audinceInfos.hasNext
@@ -33,6 +33,6 @@ class CustomIteratorAudienceInfo(audinceInfos: Iterator[AudienceInfo], update_ti
       old_json.putAll(new_json)
-      NewAudienceInfo(devId, update_time, old_json.toString)
+      NewAudienceInfo(devId, update_time, old_json.toString, "")
     }
   }
 }
src/main/scala/mobvista/dmp/datasource/rtdmp/CustomIteratorAudienceInfoV2.scala (view file @ cc706c53)

@@ -43,6 +43,6 @@ class CustomIteratorAudienceInfoV2(audinceInfos: Iterator[AudienceInfo], update_
       ).asJava
       old_json.putAll(new_json.asInstanceOf[java.util.Map[String, String]])
-      NewAudienceInfo(devId, update_time, old_json.toString)
+      NewAudienceInfo(devId, update_time, old_json.toString, "")
     }
   }
 }
src/main/scala/mobvista/dmp/datasource/rtdmp/CustomMapPartition.scala (view file @ cc706c53)

@@ -27,6 +27,7 @@ class CustomMapPartition(rows: Iterator[Row], update_time: String, expire_time:
       val devId = row.getAs[String]("devid")
       val audience_data = row.getAs[String]("audience_ids")
       val old_audience_data = row.getAs[String]("audience_data")
+      val device_type = row.getAs[String]("device_type")
       val new_json = new JSONObject()
       audience_data.split(",", -1).foreach(k => {
         if (Integer.parseInt(k) > 0) {
@@ -43,6 +44,6 @@ class CustomMapPartition(rows: Iterator[Row], update_time: String, expire_time:
       ).asJava
       old_json.putAll(new_json.asInstanceOf[java.util.Map[String, String]])
-      NewAudienceInfo(devId, update_time, old_json.toString)
+      NewAudienceInfo(devId, update_time, old_json.toString, device_type)
     }
   }
 }
src/main/scala/mobvista/dmp/datasource/rtdmp/Logic.scala (view file @ cc706c53)

 package mobvista.dmp.datasource.rtdmp

-import java.util.StringJoiner
-
 import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
 import com.datastax.oss.driver.api.core.CqlSession
 import com.datastax.oss.driver.api.core.cql.ResultSet
@@ -15,6 +13,7 @@ import org.apache.hadoop.io.Text
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import java.util.StringJoiner
 import scala.collection.JavaConversions._
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
@@ -78,7 +77,7 @@ object Logic {
           old_json.put(audienceId, new_json.getString(audienceId))
         }
       })
-      array.add(Constant.NewAudienceInfo(devId, "", old_json.toJSONString))
+      array.add(Constant.NewAudienceInfo(devId, "", old_json.toJSONString, ""))
     })
     array.iterator
   }
@@ -601,6 +600,17 @@ object Logic {
         old_json.put(audienceId, new_json.getString(audienceId))
       }
     })
-    NewAudienceInfo(devId, "", old_json.toJSONString)
+    NewAudienceInfo(devId, "", old_json.toJSONString, "")
   }
+
+  def getDevType(device_type: String): String = {
+    device_type match {
+      case "imeimd5" | "imei_md5" => "imei_md5"
+      case "gaidmd5" | "gaid_md5" => "gaid_md5"
+      case "oaidmd5" | "oaid_md5" => "oaid_md5"
+      case "idfamd5" | "idfa_md5" => "idfa_md5"
+      case "androididmd5" | "androidid_md5" => "androidid_md5"
+      case _ => ""
+    }
+  }
 }
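getDevType, added above, canonicalizes raw directory names to the `*_md5` spellings used downstream. A quick usage sketch; the function body is copied from the diff, while the assertions and harness are illustrative:

object GetDevTypeSketch {
  // Copied from the getDevType added to Logic.scala in this commit.
  def getDevType(device_type: String): String = {
    device_type match {
      case "imeimd5" | "imei_md5" => "imei_md5"
      case "gaidmd5" | "gaid_md5" => "gaid_md5"
      case "oaidmd5" | "oaid_md5" => "oaid_md5"
      case "idfamd5" | "idfa_md5" => "idfa_md5"
      case "androididmd5" | "androidid_md5" => "androidid_md5"
      case _ => "" // unrecognized types degrade to the empty string
    }
  }

  def main(args: Array[String]): Unit = {
    assert(getDevType("gaidmd5") == "gaid_md5")  // compact spelling is canonicalized
    assert(getDevType("gaid_md5") == "gaid_md5") // canonical spelling passes through
    assert(getDevType("idfa") == "")             // plain (non-md5) names are not mapped here;
                                                 // RTDmpMainPre appends "_md5" before calling
  }
}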
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMainPre.scala (view file @ cc706c53)

@@ -3,10 +3,14 @@ package mobvista.dmp.datasource.rtdmp
 import com.alibaba.fastjson.{JSONArray, JSONObject}
 import mobvista.dmp.common.{CommonSparkJob, MobvistaConstant}
 import mobvista.dmp.datasource.rtdmp.Constant.AudienceInfoPre
+import mobvista.dmp.datasource.rtdmp.Logic.getDevType
 import mobvista.dmp.util.{DateUtil, MD5Util}
 import org.apache.commons.cli.{BasicParser, Options}
 import org.apache.commons.lang.StringUtils
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.io.{LongWritable, Text}
+import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}
+import org.apache.spark.rdd.NewHadoopRDD
 import org.apache.spark.sql.{SaveMode, SparkSession}
 import org.slf4j.LoggerFactory
@@ -56,7 +60,7 @@ class RTDmpMainPre extends CommonSparkJob with Serializable {
     val sc = spark.sparkContext
     try {
-      var mergeRDD = sc.emptyRDD[(String, Int)]
+      var mergeRDD = sc.emptyRDD[(String, (Int, String))]
       // By default, process the previous hour's data
       val update_time_start = DateUtil.format(time + ":00:00", "yyyy-MM-dd HH:mm:ss")
@@ -68,6 +72,10 @@ class RTDmpMainPre extends CommonSparkJob with Serializable {
       val map: util.Map[Integer, (JSONArray, Integer, Integer, JSONObject)] =
         ServerUtil.request(update_time_start, update_time_end, audience_date_utime_start, audience_date_utime_end, 0, 1, 2)
+      val fc = classOf[TextInputFormat]
+      val kc = classOf[LongWritable]
+      val vc = classOf[Text]
+
       map.foreach(t => {
         val audienceId = Integer2int(t._1)
         val audienceOp = t._2._2
@@ -91,21 +99,31 @@ class RTDmpMainPre extends CommonSparkJob with Serializable {
         val pathUri = new URI(list.get(0)._1)
         val newAudience = if (FileSystem.get(new URI(s"${pathUri.getScheme}://${pathUri.getHost}"), sc.hadoopConfiguration)
           .exists(new Path(pathUri.toString.replace("*", "")))) {
-          sc.textFile(list.get(0)._1).repartition(100)
-            .filter(r => {
-              r.length <= 64
-            })
-            .map(r => { // First is the latest audience package
-              val device_id = if (r.matches(MobvistaConstant.md5Ptn)) {
-                r
-              } else {
-                MD5Util.getMD5Str(r)
-              }
-              (device_id, audienceId)
-            })
+          val rdd = sc.newAPIHadoopFile(list.get(0)._1, fc, kc, vc, sc.hadoopConfiguration)
+          val linesWithFileNames = rdd.asInstanceOf[NewHadoopRDD[LongWritable, Text]]
+            .mapPartitionsWithInputSplit((inputSplit, iterator) => {
+              val file = inputSplit.asInstanceOf[FileSplit]
+              iterator.map(tup => (file.getPath.getParent.getName, tup._2))
+            })
+          linesWithFileNames.repartition(100)
+            .filter(r => {
+              r._2.toString.length <= 64
+            }).map(r => { // First is the latest audience package
+            val device_id = if (r._2.toString.matches(MobvistaConstant.md5Ptn)) {
+              r._2.toString
+            } else {
+              MD5Util.getMD5Str(r._2.toString)
+            }
+            val device_type = if (r._1.endsWith("md5")) {
+              r._1
+            } else {
+              s"${r._1}_md5"
+            }
+            (device_id, (audienceId, getDevType(device_type)))
+          })
         } else {
-          sc.emptyRDD[(String, Int)]
+          sc.emptyRDD[(String, (Int, String))]
         }
         /*
         val oldAudience = sc.textFile(list.get(1)._1).repartition(100).map(r => { // Second is the old audience package, i.e. the previous version
@@ -135,24 +153,36 @@ class RTDmpMainPre extends CommonSparkJob with Serializable {
             val pathUri = new URI(audData.get(0)._1)
             if (audData.nonEmpty && FileSystem.get(new URI(s"${pathUri.getScheme}://${pathUri.getHost}"), sc.hadoopConfiguration)
               .exists(new Path(audData.get(0)._1.replace("*", "")))) {
-              sc.textFile(audData.get(0)._1)
+              val rdd = sc.newAPIHadoopFile(audData.get(0)._1, fc, kc, vc, sc.hadoopConfiguration)
+              val linesWithFileNames = rdd.asInstanceOf[NewHadoopRDD[LongWritable, Text]]
+                .mapPartitionsWithInputSplit((inputSplit, iterator) => {
+                  val file = inputSplit.asInstanceOf[FileSplit]
+                  iterator.map(tup => (file.getPath.getParent.getName, tup._2))
+                })
+              linesWithFileNames
                 .filter(r => {
-                  r.length <= 64
+                  r._2.toString.length <= 64
                 })
                 .map(r => { // Take the latest audience package
                   val device_id =
-                    if (r.matches(MobvistaConstant.md5Ptn)) {
-                      r
+                    if (r._2.toString.matches(MobvistaConstant.md5Ptn)) {
+                      r._2.toString
                     } else {
-                      MD5Util.getMD5Str(r)
+                      MD5Util.getMD5Str(r._2.toString)
                     }
-                  (device_id, audienceId)
+                  val device_type = if (r._1.endsWith("md5")) {
+                    r._1
+                  } else {
+                    s"${r._1}_md5"
+                  }
+                  (device_id, (audienceId, getDevType(device_type)))
                 })
             } else { // If absent, create an empty RDD
-              sc.emptyRDD[(String, Int)]
+              sc.emptyRDD[(String, (Int, String))]
             }
           } else {
-            sc.emptyRDD[(String, Int)]
+            sc.emptyRDD[(String, (Int, String))]
           }
         // Merge all audience packages
         mergeRDD = mergeRDD.union(updateRDD)
@@ -160,19 +190,23 @@ class RTDmpMainPre extends CommonSparkJob with Serializable {
       val df = mergeRDD.repartition(1000).groupByKey().map(r => {
         val devId = r._1
+        var deviceType = ""
         val set = new mutable.HashSet[Int]()
         // Build the audienceId -> update_date JSONObject
         r._2.foreach(t => {
-          set.add(t)
+          set.add(t._1)
+          if (StringUtils.isBlank(deviceType)) {
+            deviceType = t._2
+          }
         })
-        (devId, set)
+        (devId, set, deviceType)
       })
       import spark.implicits._
       FileSystem.get(new URI(s"s3://mob-emr-test"), sc.hadoopConfiguration).delete(new Path(output), true)
       df.map(r => {
-        AudienceInfoPre(r._1, r._2.mkString(","))
+        AudienceInfoPre(r._1, r._2.mkString(","), r._3)
       }).repartition(coalesce.toInt)
        .toDF
        .write
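The substantive change in RTDmpMainPre is swapping sc.textFile for newAPIHadoopFile so each record can be tagged with the directory it was read from; the parent directory name carries the device type. A self-contained sketch of that pattern, with a hypothetical local path and session setup (only the RDD plumbing mirrors the diff):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.sql.SparkSession

object TagLinesWithDirSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    val sc = spark.sparkContext

    // Hypothetical layout: /tmp/audience/gaidmd5/part-00000, /tmp/audience/idfa/part-00000, ...
    val rdd = sc.newAPIHadoopFile("/tmp/audience/*/part-*",
      classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.hadoopConfiguration)

    // mapPartitionsWithInputSplit exposes the FileSplit, so the parent
    // directory name (here, the device type) can be attached to each line.
    val linesWithDir = rdd.asInstanceOf[NewHadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((inputSplit, iterator) => {
        val dir = inputSplit.asInstanceOf[FileSplit].getPath.getParent.getName
        iterator.map(tup => (dir, tup._2.toString))
      })

    linesWithDir.take(5).foreach(println)
    spark.stop()
  }
}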
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMainSpe.scala (view file @ cc706c53)

@@ -112,7 +112,7 @@ class RTDmpMainSpe extends CommonSparkJob with Serializable {
       .asScala.keySet
     val df = selectDF.join(cassandraDF, Seq("devid"), "leftouter")
-      .select("devid", "audience_ids", "audience_data")
+      .select("devid", "audience_ids", "audience_data", "device_type")
       .rdd
       .mapPartitions(v => new CustomMapPartition(v, update_time = time, expire_time, update_ids))
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMerge.scala (view file @ cc706c53)

@@ -58,10 +58,13 @@ class RTDmpMerge extends CommonSparkJob with Serializable {
     val mapper = new ObjectMapper()
     try {
       import spark.implicits._
-      val paths = input.split(",", -1)
-      val daily_df = spark.read.orc(paths(0), paths(1), paths(2), paths(3), paths(4), paths(5)).rdd.map(row => {
+      // val paths = input.split(",", -1)
+      // spark.read.orc(paths(0), paths(1), paths(2), paths(3), paths(4), paths(5))
+      val daily_df = spark.read.orc(input).rdd.map(row => {
         val deviceId = row.getAs[String]("devid")
+        val deviceType = row.getAs[String]("device_type")
         val audienceMap = JSON.parseObject(row.getAs[String]("audience_data")).asInstanceOf[java.util.Map[String, String]]
           .map(kv => {
             val audienceTime = DateUtil.format(DateUtil.parse(kv._2 + ":00:00", "yyyy-MM-dd HH:mm:ss"), "yyyyMMddHH")
@@ -72,7 +75,7 @@ class RTDmpMerge extends CommonSparkJob with Serializable {
         audienceMap.keys.foreach(k => {
           updateAudienceIdSet.add(Integer.valueOf(k))
         })
-        AudienceMerge(deviceId, mapper.writeValueAsString(audienceMap.asJava), date_time)
+        AudienceMerge(deviceId, mapper.writeValueAsString(audienceMap.asJava), date_time, deviceType)
       }).toDF
         .dropDuplicates()
@@ -106,11 +109,12 @@ class RTDmpMerge extends CommonSparkJob with Serializable {
         |SELECT
         |  coalesce(d.devid, m.devid) devid,
         |  coalesce(d.audience_map, m.audience_map) audience_map,
-        |  coalesce(d.update_time, m.update_time) update_time
+        |  coalesce(d.update_time, m.update_time) update_time,
+        |  coalesce(d.device_type, m.device_type) device_type
         |  FROM daily_rtdmp d
         |  FULL OUTER JOIN
         |  (SELECT devid, process(audience_map) audience_map,
-        |  update_time
+        |  update_time, device_type
         |  FROM dwh.audience_merge
         |  WHERE dt = '@dt' AND update_time >= '@update_time'
        |  ) m
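The updated merge SQL coalesces device_type across the daily batch and the historical dwh.audience_merge partition, letting fresh values win and history fill the gaps. A toy Spark SQL sketch of that coalesce-over-full-outer-join pattern; two in-memory tables stand in for the real ones:

import org.apache.spark.sql.SparkSession

object CoalesceMergeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    Seq(("dev-1", "gaid_md5")).toDF("devid", "device_type")
      .createOrReplaceTempView("daily_rtdmp")
    Seq(("dev-1", ""), ("dev-2", "idfa_md5")).toDF("devid", "device_type")
      .createOrReplaceTempView("audience_merge")

    // Same shape as the commit's query: the daily side wins where present,
    // history supplies rows the daily batch never touched.
    spark.sql(
      """SELECT coalesce(d.devid, m.devid) devid,
        |       coalesce(d.device_type, m.device_type) device_type
        |FROM daily_rtdmp d
        |FULL OUTER JOIN audience_merge m ON d.devid = m.devid
        |""".stripMargin).show()
  }
}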
src/main/scala/mobvista/dmp/datasource/rtdmp/RTDmpMergeCK.scala (view file @ cc706c53)

@@ -6,6 +6,7 @@ import mobvista.dmp.util.DateUtil
 import mobvista.dmp.utils.clickhouse.ClickHouseConnectionFactory
 import mobvista.dmp.utils.clickhouse.ClickHouseSparkExt._
 import org.apache.commons.cli.{BasicParser, Options}
+import org.apache.spark.sql.functions.lit
 import ru.yandex.clickhouse.ClickHouseDataSource

 import scala.collection.JavaConversions._
@@ -51,6 +52,7 @@ class RTDmpMergeCK extends CommonSparkJob with Serializable {
     spark.udf.register("process", process _)
     val df = spark.sql(sql.replace("@dt", date_time))
+      .withColumn("device_type", lit(""))
     implicit val clickhouseDataSource: ClickHouseDataSource = ClickHouseConnectionFactory.get(host)
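RTDmpMergeCK exports the merged table to ClickHouse; its source query does not yet produce device_type, so the commit pads the DataFrame with an empty column to keep the write schema aligned. A hedged sketch of just that step (the toy rows and session setup are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

object PadDeviceTypeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(("dev-1", "{\"101\":\"2021072010\"}", "2021-07-20 10:00:00"))
      .toDF("devid", "audience_map", "update_time")

    // Same move as the diff: add a constant empty device_type so the
    // ClickHouse writer sees the new column even where no type is known.
    val padded = df.withColumn("device_type", lit(""))
    padded.printSchema() // devid, audience_map, update_time, device_type
  }
}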