CommonInstallList.scala 11.3 KB
package mobvista.dmp.common

import com.google.gson.{JsonArray, JsonElement, JsonObject}
import mobvista.dmp.datasource.mpsdk.InstallInfo
import mobvista.prd.datasource.util.GsonUtil
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
import org.joda.time.format.DateTimeFormat

import java.net.URI
import java.util
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

/**
 * 填数据合并到安装列表通用程序,
 * 记成该类后,需实现数据处理方法即可
 * fengliang
 */
abstract class CommonInstallList extends CommonSparkJob with Serializable {

  protected val FIELD_SPLIT = "\t"
  protected var DAILY_STORE_FORMAT = "ORC"

  def run(args: Array[String]): Int = {
    var sc: SparkContext = null
    try {
      options.addOption("date", true, "[must] today")
      options.addOption("oldInput", true, "[must] yestoday install data path")
      val commandLine = commParser.parse(options, args)
      if (!checkMustOption(commandLine)) {
        printUsage(options)
        return 1
      } else {
        printOptions(commandLine)
      }

      val date = commandLine.getOptionValue("date")
      val input = commandLine.getOptionValue("input")
      val output = commandLine.getOptionValue("output")
      val oldInput = commandLine.getOptionValue("oldInput")
      val parallelism = commandLine.getOptionValue("parallelism").toInt
      val coalesce = commandLine.getOptionValue("coalesce").toInt

      val expireDate = DateTimeFormat.forPattern("yyyy-MM-dd").parseDateTime(date).minusMonths(12).toString("yyyy-MM-dd")

      val spark = SparkSession
        .builder()
        .appName("MPSDKTotal")
        .config("spark.rdd.compress", "true")
        .config("spark.default.parallelism", parallelism)
        .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .getOrCreate()

      //  任务重试过程中,路径已经存在造成无法写入的bug
      FileSystem.get(new URI(s"s3://mob-emr-test"), spark.sparkContext.hadoopConfiguration).delete(new Path(output), true)

      //  FileSystem.get(spark.sparkContext.hadoopConfiguration).delete(new Path(output), true)

      val version = spark.conf.get("spark.app.version", "-1")

      val tag = spark.conf.get("spark.app.tag", "0")
      val read_text_file_tag = spark.conf.get("spark.read.textfile.tag", "0")
      //  val dailyDF = spark.read.format(DAILY_STORE_FORMAT).load(input)
      val dailyDF = if (tag.equals("-1")) {
        spark.emptyDataFrame
      } else if (read_text_file_tag.equals("1")) {
        println("wo  ai   nimen=======")
        //        添加对text文本格式的读取   (2021.02.05) 目前alipay_activation   alipay_acquisition 两个分区用到
        val schema: StructType = StructType(Array(
          StructField("device_id", StringType),
          StructField("device_type", StringType),
          StructField("platform", StringType),
          StructField("package_name", StringType)
        ))

        val textRdd: RDD[Row] = spark.sparkContext.textFile(input).map(element => {
          Row(element.split("\t", -1)(0), element.split("\t", -1)(1), element.split("\t", -1)(2), element.split("\t", -1)(3))
        })
        spark.createDataFrame(textRdd, schema)
      }
      else {
        spark.read.orc(input)
      }

      val dailyData =
        if (version.equals("-1")) {
          dailyDF.rdd.map(convertDayRow)
        } else {
          dailyDF.rdd.map(convertDayRow).filter(rs => {
            rs(8).equals(version)
          })
        }

      val dailyRDD = dailyData.flatMap(processDailyData(_, date))
        .filter(tuple => {
          val array = splitFun(tuple._1)
          (array(0).matches(didPtn) && !allZero.equals(array(0))) || array(0).matches(imeiPtn) || array(0).matches(andriodIdPtn) ||
            (!array(0).matches(md5Ptn) && array(0).matches(imeiMd5Ptn)) || ("oaid".equalsIgnoreCase(array(1)) && array(0).matches(oaidAnotherPtn)) ||
            ("ruid".equalsIgnoreCase(array(1)) && array(0).length > 16)
          // 新增 andriodId 需求
          // 新增 ruid 需求
        })
        .groupByKey()
        .map(tuple => {
          val set = new util.HashSet[InstallInfo]()
          tuple._2.foreach(info => {
            set.add(info)
          })
          (tuple._1, set)
        })

      sc = spark.sparkContext
      val installRDD = sc.textFile(oldInput)
        .map(splitFun(_))
        .filter(array =>
          //  新增 ruid 过滤规则
          array.length >= 4 && (
            (array(0).matches(didPtn) && !allZero.equals(array(0))) || array(0).matches(imeiPtn) || array(0).matches(andriodIdPtn) || (!array(0).matches(md5Ptn) && array(0).matches(imeiMd5Ptn)) ||
              ("oaid".equalsIgnoreCase(array(1)) && array(0).matches(oaidAnotherPtn)) || array(0).length > 16)
        )
        // 新增andriodId 需求, 新增imeiMd5 需求
        .map(splits => {
          (s"${splits(0)}$DATA_SPLIT${splits(1)}$DATA_SPLIT${splits(2)}", splits(3))
        })

      dailyRDD.fullOuterJoin(installRDD)
        .map(tuple => {
          val key = tuple._1
          val valTuple = tuple._2
          val dailyOpt = valTuple._1
          val totalOpt = valTuple._2

          var tags = ""
          if (dailyOpt == None && totalOpt != None) {
            // 需删除过期的安装信息
            val installArray = new JsonArray
            GsonUtil.String2JsonArray(totalOpt.get)
              .foreach(element => {
                val installDate = getJsonValue(element, "date")
                // 验证安装信息是否已超出期限
                if (StringUtils.isNotEmpty(installDate)) {
                  if (installDate.compareTo(expireDate) >= 0) {
                    installArray.add(element)
                  }
                } else {
                  installArray.add(element)
                }
              })
            tags = installArray.toString
          } else if (dailyOpt != None && totalOpt == None) {
            val jsonArray = new JsonArray
            dailyOpt.get.foreach(installInfo => {
              var installDate = installInfo.getDate()
              if (installDate.compareTo(date) > 0) {
                installDate = date
              }
              if (jsonArray.size() < 1000) {
                val json = new JsonObject

                /**
                 * daily 中所带的 package 的 install_date 可能过期
                 */
                if (installDate.compareTo(expireDate) > 0) {
                  json.addProperty("date", installDate)
                  json.addProperty("package_name", installInfo.getPackage_name().replaceAll("[^0-9a-zA-Z\\.\\_]+", ""))
                  jsonArray.add(json)
                }
              }
            })
            tags = jsonArray.toString
          } else if (dailyOpt != None && totalOpt != None) {
            val map = new util.HashMap[String, String]()
            GsonUtil.String2JsonArray(totalOpt.get)
              .foreach(element => {
                val installPackage = getJsonValue(element, "package_name")
                val installDate = getJsonValue(element, "date")
                // 验证安装信息是否已超出期限
                if (StringUtils.isNotEmpty(installDate)) {
                  if (installDate.compareTo(expireDate) >= 0) {
                    map.put(installPackage, installDate)
                  }
                } else {
                  map.put(installPackage, installDate)
                }
              })
            dailyOpt.get
              .foreach(installInfo => {
                var installDate = installInfo.getDate()
                if (installDate.compareTo(date) > 0) {
                  installDate = date
                }

                val packageName = installInfo.getPackage_name().replaceAll("[^0-9a-zA-Z\\.\\_]+", "")

                // adn 上报业务  新增安装包处理逻辑;该逻辑不会影响其他业务线数据处理logic
                if (StringUtils.isNotBlank(packageName)) {
                  if (packageName.endsWith(".notinstall")) { //去掉包名.delete 和 包名
                    val packageNameDel1 = packageName.replace(".notinstall", "") //包名
                    val packageNameDel2 = packageNameDel1 + ".delete" //包名.delete
                    map.remove(packageNameDel1)
                    map.remove(packageNameDel2)
                  } else if (packageName.endsWith(".delete")) { //去掉包名.notinstall 和 包名
                    val packageNameDel1 = packageName.replace(".delete", "") //包名
                    val packageNameDel2 = packageNameDel1 + ".notinstall" //包名.notinstall
                    map.remove(packageNameDel1)
                    map.remove(packageNameDel2)
                  } else { // 不以 notinstall delete 为后缀,要去掉 安装包.notinstall和安装包.delete
                    val packageNameDel1 = packageName + ".notinstall" //包名.notinstall
                    val packageNameDel2 = packageName + ".delete" //包名.delete
                    map.remove(packageNameDel1)
                    map.remove(packageNameDel2)
                  }
                }
                val tmpDate = map.get(packageName)
                if (tmpDate != null) {
                  if (tmpDate.compareTo(installDate) < 0) {
                    map.put(packageName, installDate)
                  }
                } else {
                  if (installDate.compareTo(expireDate) >= 0) {
                    map.put(packageName, installDate)
                  }
                }
              })

            val jsonArray = new JsonArray
            map.entrySet()
              .foreach(entry => {
                if (jsonArray.size() < 1000) {
                  val json = new JsonObject
                  if (StringUtils.isNotEmpty(entry.getValue)) {
                    json.addProperty("date", entry.getValue)
                  }
                  json.addProperty("package_name", entry.getKey)
                  jsonArray.add(json)
                }
              })
            tags = jsonArray.toString
          }

          s"${key}$DATA_SPLIT${tags}"
        }).repartition(coalesce)
        .saveAsTextFile(output, classOf[GzipCodec])

    } finally {
      if (sc != null) {
        sc.stop()
      }
    }
    0
  }

  def getJsonValue(element: JsonElement, key: String): String = {
    if (element != null && !element.isJsonNull) {
      element.getAsJsonObject.get(key).getAsString
    } else {
      ""
    }
  }

  /**
   * 将row转化为Array[String]
   *
   * @param row
   * @return
   */
  def convertDayRow(row: Row): Array[String] = {
    val buffer = new ArrayBuffer[String]()
    for (i <- 0 until row.size) {
      if (row.get(i) != null) {
        buffer += row.get(i).toString
      } else {
        buffer += ""
      }
    }
    buffer.toArray
  }

  /**
   * 解析天处理结果数据
   *
   * @param array
   * @param date
   * @return
   */
  def processDailyData(array: Array[String], date: String): Array[Tuple2[String, InstallInfo]]
}

case class InstallListVO(key: String, package_name: String, date: String)