package mobvista.dmp.main

import org.apache.commons.cli.{BasicParser, Options}
import org.apache.commons.lang.StringUtils
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

/**
 * Spark job that converts tab-separated text files into Parquet.
 *
 * Every column is written as a nullable string. Column names are taken from
 * the `-schema` option, which holds a single space-separated list of names.
 */
class Convert2Parquet extends Serializable{

  /**
   * Parses the command line, reads the text input, and writes it as
   * gzip-compressed Parquet.
   *
   * @param args command-line arguments: `-input`, `-output` and `-schema`, all required
   * @return always 0; failures surface as exceptions
   * @throws IllegalArgumentException if a required option is missing or blank
   */
  def run(args: Array[String]) : Int = {
    val commandLine = new BasicParser().parse(buildOptions, args)

    // getOptionValue yields null for an option that was not supplied; fail
    // fast with a clear message before a SparkSession is started.
    def required(name: String): String = {
      val value = commandLine.getOptionValue(name)
      require(StringUtils.isNotBlank(value), s"Missing required option: -$name")
      value
    }

    val input = required("input")
    val output = required("output")
    val schemaStr = required("schema")

    val spark = SparkSession
      .builder()
      .appName("Convert2Parquet")
      .config("spark.rdd.compress", "true")
      .config("spark.driver.userClassPathFirst", "true")
      .config("spark.executor.userClassPathFirst", "true")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    try {
      val dataSplit = "\t"
      // One nullable string column per space-separated name in the schema option.
      val fields = schemaStr.split(" ")
        .map(fieldName => StructField(fieldName, StringType, nullable = true))
      val schema = StructType(fields)

      // splitPreserveAllTokens keeps empty tokens, so empty columns stay in position.
      // NOTE(review): lines whose field count differs from the schema are not
      // reconciled here; presumably they fail when the frame is written — confirm.
      val rowRDD = spark.sparkContext.textFile(input)
        .map(line => Row.fromSeq(StringUtils.splitPreserveAllTokens(line, dataSplit, -1).toSeq))

      spark.createDataFrame(rowRDD, schema)
        .write
        // "compression" is the writer-level option; the session conf key
        // "spark.sql.parquet.compression.codec" is not honoured when passed
        // as a writer option.
        .option("compression", "gzip")
        .parquet(output)
    } finally {
      spark.stop()
    }
    0
  }

  /**
   * Declares the command-line options understood by [[run]].
   *
   * @return options `input`, `output` and `schema`, each taking one argument
   */
  def buildOptions: Options = {
    val options = new Options
    options.addOption("input", true, "input path")
    options.addOption("output", true, "output path")
    // The value is split on spaces into column names; it is not read as a file.
    options.addOption("schema", true, "space-separated list of column names")
    options
  }

}

/** Command-line entry point for the Convert2Parquet job. */
object Convert2Parquet {
  /**
   * Builds a fresh job instance and runs it with the raw command-line arguments.
   * The integer status returned by `run` is not used.
   *
   * @param args command-line arguments forwarded unchanged to `run`
   */
  def main(args: Array[String]): Unit = {
    val job = new Convert2Parquet()
    job.run(args)
  }
}
