package mobvista.dmp.demo

import java.net.URI

import mobvista.dmp.util.MRUtils
import org.apache.commons.cli.{BasicParser, Options}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

/**
  * @package: mobvista.dmp.demo
  * @author: wangjf
  * @date: 2020/3/13
  * @time: 5:37 PM
  * @email: jinfeng.wang@mobvista.com
  * @phone: 152-1062-7698
  */
// Spark batch job that merges two package-tag files into one text output.
//
// Every record of `input1` (';'-separated) is emitted. A record of `input2`
// (tab-separated) is emitted unless its (package name, lower-cased platform) key
// also occurs in `input1` AND its tag type is "category" or "rules".
class PackageTagJob extends Serializable {

  /**
    * Declares the command line options understood by `run`.
    *
    * @return the options `input1`, `input2` and `output`, each taking one argument
    */
  def commandOptions(): Options = {
    val options = new Options()
    options.addOption("input1", true, "input1")
    options.addOption("input2", true, "input2")
    options.addOption("output", true, "output")
    options
  }

  /**
    * Parses the arguments, runs the merge and writes the result to `output`.
    *
    * Both inputs are expected to carry at least six fields per line:
    * package name, platform, tag type, first tag, second tag, comment.
    * A package name of the form `id<digits>` is reduced to its digits.
    * Any existing content at `output` is deleted recursively before writing,
    * and the result is coalesced into a single partition.
    *
    * @param args command line arguments providing `-input1`, `-input2` and `-output`
    * @throws IllegalArgumentException if one of the three options is missing
    */
  protected def run(args: Array[String]): Unit = {
    val commandLine = new BasicParser().parse(commandOptions(), args)
    val input1 = commandLine.getOptionValue("input1")
    val input2 = commandLine.getOptionValue("input2")
    val output = commandLine.getOptionValue("output")

    // getOptionValue yields null for an option that was not supplied; fail early with a
    // clear message instead of a NullPointerException deep inside Spark.
    Seq("input1" -> input1, "input2" -> input2, "output" -> output).foreach {
      case (name, value) => require(value != null, s"missing required option -$name")
    }

    val spark = SparkSession
      .builder()
      .appName("PackageTagJob")
      .config("spark.rdd.compress", "true")
      .config("spark.io.compression.codec", "lz4")
      .config("spark.sql.warehouse.dir", "s3://mob-emr-test/spark-warehouse")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()
    try {
      val sc = spark.sparkContext

      // Turns the split fields of one line into (lookup key, tag type, output line).
      // Declared as a function value (not a method) so task closures do not capture `this`.
      val toRecord: Array[String] => (String, String, String) = fields => {
        val packageName =
          if (fields(0).matches("^id\\d+$")) fields(0).replace("id", "") else fields(0)
        val platform = fields(1)
        val tagType = fields(2)
        val key = MRUtils.JOINER.join(packageName, platform.toLowerCase)
        val line =
          MRUtils.JOINER.join(packageName, platform, tagType, fields(3), fields(4), fields(5))
        (key, tagType, line)
      }

      // Limit -1 keeps trailing empty fields, so a record with an empty comment still has
      // six fields. Lines with fewer than six fields still fail the job.
      val records1 = sc.textFile(input1).map(_.split(";", -1)).map(toRecord).cache()

      // A driver-side mutable set filled from inside a task closure is never visible to
      // other tasks: each task works on its own deserialized copy. Collect the keys of
      // input1 explicitly and ship them to the executors as a broadcast variable.
      // NOTE: this assumes the distinct keys of input1 fit in driver/executor memory.
      val keysOfInput1 = sc.broadcast(records1.map(_._1).distinct().collect().toSet)

      val rdd1 = records1.map(_._3)
      val rdd2 = sc.textFile(input2)
        .map(_.split("\t", -1))
        .map(toRecord)
        .filter { case (key, tagType, _) =>
          // Drop only "category"/"rules" records whose key is already covered by input1.
          !keysOfInput1.value.contains(key) || (tagType != "category" && tagType != "rules")
        }
        .map(_._3)

      // Resolve the filesystem from the output path itself rather than a fixed bucket, so
      // the delete targets the same location saveAsTextFile writes to.
      val outputPath = new Path(output)
      val outputUri: URI = outputPath.toUri
      FileSystem.get(outputUri, sc.hadoopConfiguration).delete(outputPath, true)
      rdd1.union(rdd2).coalesce(1).saveAsTextFile(output)
    } finally {
      spark.stop()
    }
  }
}

/** Command line entry point: creates a job instance and passes it the raw arguments. */
object PackageTagJob {
  def main(args: Array[String]): Unit = {
    val job = new PackageTagJob()
    job.run(args)
  }
}