Commit 22455beb by mengdongxing

Add new file

parent 6b7111c4
#' Bootstrap estimate of the median
#'
#' Draws `n_boot` resamples of `x` (with replacement, each the same length as
#' `x`), takes the median of every resample and returns the mean of those
#' medians.
#'
#' @param x Values to resample; coerced with `as.vector()`.
#' @param n_boot Number of bootstrap resamples. Defaults to 1000.
#' @param na.rm If `TRUE`, `NA`s are dropped before resampling. Defaults to
#'   `FALSE`, in which case any resample containing an `NA` has an `NA`
#'   median and the result is `NA`.
#' @return A single numeric value: the mean of the bootstrap medians, or `NA`
#'   when there is nothing to resample.
BootStarpMedian <- function(x, n_boot = 1000, na.rm = FALSE) {
  x <- as.vector(x)
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  if (n == 0L) {
    # The median of an empty sample is NA, so the mean of the medians is too.
    return(NA_real_)
  }
  # Resample by index instead of calling sample(x, ...): for a length-one
  # numeric x >= 1, sample() draws from 1:x rather than from x itself, which
  # would corrupt the estimate for groups that contain a single observation.
  med_vec <- replicate(
    n = n_boot,
    median(x[sample.int(n, size = n, replace = TRUE)])
  )
  mean(med_vec)
}
# ---------------------------------------------------------------------------
# App level: bootstrap-median install rate per (category_id, isgame) group,
# attached back onto every app/day row.
# ---------------------------------------------------------------------------

# Per app, category, game flag and day: installs divided by clicks, for
# ds between 2017-02-20 and 2017-02-26.
app_query_sql <- "select appid,
category_id,
isgame ,
ds,
sum(num_install) / sum(dupnum_click_all) as ins_rate
from tkdm.tkdm_data_active_detial_day
where ds between '2017-02-20' and '2017-02-26'
group by appid,category_id,isgame,ds
"
app_query_result <- sql(app_query_sql)

# Schema of the rows produced by the grouped function below: the two grouping
# keys (suffixed with _app so they do not clash with the query columns in the
# join) followed by the bootstrap median.
app_schema <- structType(
  structField("category_id_app", "integer"),
  structField("isgame_app", "integer"),
  structField("ins_rate_std", "double")
)

# One output row per (category_id, isgame) group: the group key plus the
# bootstrap median of that group's install rates.
app <- gapply(
  app_query_result,
  c("category_id", "isgame"),
  function(key, x) {
    data.frame(key, BootStarpMedian(x$ins_rate), stringsAsFactors = FALSE)
  },
  app_schema
)

# Attach each group's statistic to the matching app/day rows and keep only
# the columns of interest.
app_join_cond <- app_query_result$category_id == app$category_id_app &
  app_query_result$isgame == app$isgame_app
app_output <- select(
  join(app_query_result, app, app_join_cond, "inner"),
  "appid", "category_id", "isgame", "ins_rate_std", "ds"
)
# ---------------------------------------------------------------------------
# Cid level: bootstrap-median install rate per (category_id, isgame, cid)
# group, attached back onto every source row.
# ---------------------------------------------------------------------------

# Row-level data for ds between 2017-02-20 and 2017-02-26.
# `ds` is selected here because the final select() below asks for it; without
# it that select() fails on an unresolvable column.
# NOTE(review): ins_rate is read as a stored column here, whereas the
# app-level query derives it from num_install / dupnum_click_all -- confirm
# the table really exposes an ins_rate column.
cid_query_sql <- "select appid,
cid,
category_id,
isgame ,
ins_rate,
ds
from tkdm.tkdm_data_active_detial_day
where ds between '2017-02-20' and '2017-02-26'
"
# Result of the query.
aa <- sql(cid_query_sql)

# Schema of the rows produced by the grouped function below: the three
# grouping keys (suffixed with _cid so they do not clash with the query
# columns in the join) followed by the bootstrap median.
cid_schema <- structType(
  structField("category_id_cid", "integer"),
  structField("isgame_cid", "integer"),
  structField("cid_cid", "integer"),
  structField("ins_rate_std", "double")
)

# One output row per (category_id, isgame, cid) group: the group key plus the
# bootstrap median of that group's install rates.
cid <- gapply(
  x = aa,
  cols = c("category_id", "isgame", "cid"),
  func = function(key, x) {
    data.frame(key, BootStarpMedian(x$ins_rate), stringsAsFactors = FALSE)
  },
  schema = cid_schema
)

# Attach each group's statistic to the matching source rows.
cid_output <- join(
  x = aa,
  y = cid,
  joinExpr = aa$category_id == cid$category_id_cid &
    aa$isgame == cid$isgame_cid &
    aa$cid == cid$cid_cid,
  joinType = "inner"
)
cid_output <- select(
  cid_output,
  "appid", "cid", "category_id", "isgame", "ins_rate_std", "ds"
)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment