# BootStarpMedian.R 2.65 KB
#' Bootstrap estimate of the median
#'
#' Draws `n_boot` resamples of `x` (each of size `length(x)`, with
#' replacement), takes the median of every resample and returns the mean of
#' those medians.
#'
#' @param x Values to resample; coerced with `as.vector()`.
#' @param n_boot Number of bootstrap resamples. Defaults to 1000, the value
#'   that used to be hard-coded.
#' @param na.rm If `TRUE`, drop `NA` values before resampling. Defaults to
#'   `FALSE`, in which case any resample containing `NA` makes the result `NA`.
#' @return A single number: the mean of the resampled medians, or `NA` when
#'   there is nothing to resample.
BootStarpMedian <- function(x, n_boot = 1000L, na.rm = FALSE) {
  x <- as.vector(x)
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  if (n == 0L) {
    # Nothing to resample; the median of an empty vector is NA.
    return(NA_real_)
  }
  # Resample by index rather than calling sample(x, ...): sample() interprets
  # a single number >= 1 as "draw from 1:x", which would corrupt the estimate
  # for groups that contain exactly one row.
  med_vec <- vapply(
    seq_len(n_boot),
    function(i) median(x[sample.int(n, size = n, replace = TRUE)]),
    numeric(1)
  )
  mean(med_vec)
}

# ---------------------------------------------------------------------------
# App level: attach a bootstrap-median install rate to every app/day row
# ---------------------------------------------------------------------------

# Install rate (installs / clicks) per app, category, game flag and day
# for the week 2017-02-20 .. 2017-02-26.
app_query_sql <- "select appid,
                         category_id,
                         isgame ,
                         ds,
                         sum(num_install) / sum(dupnum_click_all) as ins_rate
                   from tkdm.tkdm_data_active_detial_day
                   where ds between '2017-02-20' and '2017-02-26'
                   group by appid,category_id,isgame,ds
"
app_query_result <- sql(app_query_sql)

# Schema of the per-group result: the two grouping keys (renamed with an
# `_app` suffix so they do not clash in the join below) plus the statistic.
app_schema <- structType(
  structField("category_id_app", "integer"),
  structField("isgame_app", "integer"),
  structField("ins_rate_std", "double")
)

# One output row per (category_id, isgame) group holding the
# BootStarpMedian() of that group's install rates.
app <- gapply(
  x = app_query_result,
  cols = c("category_id", "isgame"),
  func = function(key, rows) {
    data.frame(key, BootStarpMedian(rows$ins_rate), stringsAsFactors = FALSE)
  },
  schema = app_schema
)

# Bring the group-level statistic back onto the per-app rows.
app_output <- join(
  app_query_result,
  app,
  (app_query_result$category_id == app$category_id_app) &
    (app_query_result$isgame == app$isgame_app),
  "inner"
)
app_output <- select(
  app_output,
  "appid", "category_id", "isgame", "ins_rate_std", "ds"
)

# ---------------------------------------------------------------------------
# Cid level: attach a bootstrap-median install rate to every app/cid row
# ---------------------------------------------------------------------------

# Rows for the week 2017-02-20 .. 2017-02-26.
# `ds` is selected here because the final select() below returns it; without
# it that select() cannot resolve the column.
# NOTE(review): unlike the app-level query, `ins_rate` is read as a raw column
# rather than computed from num_install / dupnum_click_all -- confirm the
# table really exposes it.
cid_query_sql <- "select appid,
                     cid, 
                     category_id,
                     isgame ,
                     ins_rate,
                     ds
                   from tkdm.tkdm_data_active_detial_day
                   where ds between '2017-02-20' and '2017-02-26'
"
# Result of the query (name `aa` kept for compatibility with any later code).
aa <- sql(cid_query_sql)

# Schema of the per-group result: the three grouping keys (renamed with a
# `_cid` suffix so they do not clash in the join below) plus the statistic.
# NOTE(review): assumes the key columns are integers in the source table --
# confirm.
cid_schema <- structType(
  structField("category_id_cid", "integer"),
  structField("isgame_cid", "integer"),
  structField("cid_cid", "integer"),
  structField("ins_rate_std", "double")
)

# One output row per (category_id, isgame, cid) group holding the
# BootStarpMedian() of that group's install rates.
cid <- gapply(
  x = aa,
  cols = c("category_id", "isgame", "cid"),
  func = function(key, x) {
    data.frame(key, BootStarpMedian(x$ins_rate), stringsAsFactors = FALSE)
  },
  schema = cid_schema
)

# Bring the group-level statistic back onto the detail rows.
cid_output <- join(
  x = aa,
  y = cid,
  joinExpr = aa$category_id == cid$category_id_cid &
    aa$isgame == cid$isgame_cid &
    aa$cid == cid$cid_cid,
  joinType = "inner"
)
cid_output <- select(
  cid_output,
  "appid", "cid", "category_id", "isgame", "ins_rate_std", "ds"
)