package mobvista.prd.datasource.newall;

import com.google.common.collect.Lists;
import mobvista.prd.datasource.util.MRUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Created by liushuai on 2017/3/20 0020.
 * desc  : Cleans the daily app-install data of 3s devices.
 *
 * <p>The job reads two input paths and distinguishes the records by shape:
 * lines containing a tab are treated as three-column lookup records whose third
 * column is joined (by the first column) onto the comma-separated tracking
 * records. The reducer emits one gzip-compressed text line per tracking record
 * with the looked-up value substituted as the package name.
 *
 * <p>Usage: {@code TrackingInstallDailyMR <input path 1> <input path 2> <output path>}
 */
public class TrackingInstallDailyMR {
    /**
     * Configures and submits the job, exiting with 0 on success and 1 on failure.
     *
     * @param args generic Hadoop options followed by two input paths and one output path
     */
    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (otherArgs.length < 3) {
            System.err.println("Usage: TrackingInstallDailyMR <input path 1> <input path 2> <output path>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "TrackingInstallDailyMR");

        job.setJarByClass(TrackingInstallDailyMR.class);
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setMapperClass(TrackingInstallDailyMapper.class);
        job.setReducerClass(TrackingInstallDailyReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits, keyed by the join column, either the lookup value (tab-containing
     * lines) or a joined tracking record (comma-separated lines).
     */
    public static class TrackingInstallDailyMapper extends Mapper<LongWritable, Text, Text, Text> {
        /** CSV column holding the primary device id. */
        private static final int PRIMARY_DEVICE_ID_INDEX = 10;
        /** CSV column holding the fallback device id. */
        private static final int FALLBACK_DEVICE_ID_INDEX = 28;
        /** Number of dash-separated segments a usable device id must have. */
        private static final int DEVICE_ID_SEGMENTS = 5;

        /** Upper-case hex plus dashes is classified as an iOS identifier. */
        private static final Pattern IDFA_PATTERN = Pattern.compile("^[0-9A-F\\-]+$");
        private static final Pattern DASH = Pattern.compile("-");
        /** Device ids made up only of zeros carry no information and are dropped. */
        private static final Pattern ALL_ZERO_DEVICE_ID = Pattern.compile("^0*-0*-0*-0*-0*$");

        /** Value of the {@code task.date} configuration entry; appended to every tracking record. */
        String date;
        Text outKey = new Text();
        Text outValue = new Text();

        @Override
        public void setup(Context context) throws IOException {
            date = context.getConfiguration().get("task.date");
        }

        /**
         * Routes one input line to the lookup or the tracking handling.
         * Lines that do not have the expected shape are skipped silently.
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            if (line.contains("\t")) {
                // Lookup record: MRUtils.SPLITTER is defined elsewhere (presumably tab-delimited).
                String[] fields = MRUtils.SPLITTER.split(line, -1);
                if (fields.length != 3) {
                    return;
                }
                outKey.set(fields[0]);
                outValue.set(fields[2]);
                context.write(outKey, outValue);
                return;
            }

            String[] fields = line.split(",", -1);
            String deviceId = selectDeviceId(fields);
            if (deviceId == null || ALL_ZERO_DEVICE_ID.matcher(deviceId).matches()) {
                return;
            }

            String platform = fixPlatform(deviceId);
            String deviceType = getDeviceType(platform);
            outKey.set(fields[1]);
            outValue.set(MRUtils.JOINER.join(deviceId, deviceType, platform, fields[1], fields[3], date));
            context.write(outKey, outValue);
        }

        /**
         * Picks the device id of a tracking record: the primary column when it has
         * five dash-separated segments, otherwise the fallback column under the same
         * condition.
         *
         * @return the chosen device id, or {@code null} when the line is too short
         *         or neither column looks like a device id
         */
        private String selectDeviceId(String[] fields) {
            // Bounds checks keep blank or truncated lines from failing the whole task.
            if (fields.length > PRIMARY_DEVICE_ID_INDEX
                    && hasDeviceIdShape(fields[PRIMARY_DEVICE_ID_INDEX])) {
                return fields[PRIMARY_DEVICE_ID_INDEX];
            }
            if (fields.length > FALLBACK_DEVICE_ID_INDEX
                    && hasDeviceIdShape(fields[FALLBACK_DEVICE_ID_INDEX])) {
                return fields[FALLBACK_DEVICE_ID_INDEX];
            }
            return null;
        }

        /** Returns whether the candidate splits into exactly five dash-separated segments. */
        private boolean hasDeviceIdShape(String candidate) {
            return DASH.split(candidate, -1).length == DEVICE_ID_SEGMENTS;
        }

        /** Classifies a device id as {@code "ios"} (upper-case hex) or {@code "android"}. */
        private String fixPlatform(String deviceId) {
            if (IDFA_PATTERN.matcher(deviceId).matches()) {
                return "ios";
            }
            return "android";
        }

        /** Maps a platform name to the device-id type used in the output. */
        private String getDeviceType(String platform) {
            switch (platform) {
            case "ios":
                return "idfa";
            case "android":
            case "adr":
                return "gaid";
            default:
                return "unknown";
            }
        }
    }

    /**
     * Joins the lookup value of a key onto every tracking record of that key and
     * writes one output line per tracking record.
     */
    public static class TrackingInstallDailyReducer extends Reducer<Text, Text, Text, NullWritable> {
        /** Minimum number of columns a tracking record must have to be written. */
        private static final int MIN_RECORD_FIELDS = 5;
        /** Package names of the form {@code id<digits>}. */
        private static final Pattern ID_PREFIXED_PACKAGE = Pattern.compile("^id\\d+$");

        Text outKey = new Text();

        /**
         * Values with a single column are taken as the package name for the key;
         * all other values are buffered as tracking records. Keys without a package
         * name produce no output.
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String packageName = "";
            List<String[]> records = Lists.newArrayList();
            for (Text val : values) {
                String[] fields = MRUtils.SPLITTER.split(val.toString(), -1);
                if (fields.length == 1) {
                    packageName = fields[0];
                } else {
                    records.add(fields);
                }
            }
            if (packageName.isEmpty()) {
                return;
            }

            // For iOS records an "id<digits>" package name is written without the "id" prefix.
            // The stripped form is computed once and applied per record, so that one iOS
            // record no longer changes the package name written for the records after it.
            String iosPackageName = ID_PREFIXED_PACKAGE.matcher(packageName).matches()
                    ? packageName.substring(2)
                    : packageName;

            for (String[] fields : records) {
                if (fields.length < MIN_RECORD_FIELDS) {
                    // Malformed record: skip it rather than fail the reducer.
                    continue;
                }
                String deviceId = fields[0].replace("\"", "").replace("[", "");
                String outPackage = "ios".equals(fields[2]) ? iosPackageName : packageName;

                outKey.set(MRUtils.JOINER.join(deviceId, fields[1], fields[2], outPackage, fields[4]));
                context.write(outKey, NullWritable.get());
            }
        }
    }
}
