// GaInstallDailyMR.java 3.55 KB
package mobvista.dmp.datasource.ga.mapreduce;

import com.google.common.base.Joiner;
import mobvista.dmp.common.CommonMapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Cleans the daily installed-app data of GA devices.
 *
 * <p>The map-side logic lives in {@link GaInstallDailyMapper}; {@link #run(String[])}
 * configures and submits the job.
 *
 * author: houying
 * date  : 16-10-24
 */
public class GaInstallDailyMR extends Configured implements Tool {

    /**
     * Rewrites each pipe-delimited input line as a tab-delimited output line.
     *
     * <p>An input line must split into exactly four columns. The second column selects the
     * platform (case-insensitive): {@code ios} yields id type {@code idfa} and platform
     * {@code ios}; {@code android} yields id type {@code gaid} and platform {@code adr}.
     * The emitted key is {@code col0 \t idType \t platform \t col2 \t col3}; the value is
     * always {@link NullWritable}.
     *
     * <p>Lines with a wrong column count or an unrecognised platform are dropped and
     * reported through {@code CommonMapReduce.setMetrics} under the {@code DMP} group.
     */
    public static class GaInstallDailyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        /** Number of columns a valid input line must split into. */
        private static final int EXPECTED_COLUMNS = 4;
        /** Input column separator (a literal pipe character). */
        private static final Pattern SPLIT_PATTERN = Pattern.compile("\\|");
        /** Output column joiner; null parts are written as empty strings. */
        private static final Joiner TAB_JOINER = Joiner.on("\t").useForNull("");

        /** Reused across map() calls to avoid allocating a new Text per record. */
        private final Text outKey = new Text();

        /**
         * Validates one input line and, when valid, writes its normalised form.
         *
         * @param key     input key supplied by the framework; not used
         * @param value   one raw input line
         * @param context task context used for metrics and output
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Pattern.split(CharSequence) discards trailing empty strings, so a line whose
            // last column is empty yields fewer than four parts and is rejected here.
            String[] columns = SPLIT_PATTERN.split(value.toString());
            if (columns.length != EXPECTED_COLUMNS) {
                CommonMapReduce.setMetrics(context, "DMP", "column_num_error", 1);
                return;
            }

            final String idType;
            final String platform;
            if (columns[1].equalsIgnoreCase("ios")) {
                idType = "idfa";
                platform = "ios";
            } else if (columns[1].equalsIgnoreCase("android")) {
                idType = "gaid";
                platform = "adr";
            } else {
                // Unknown platform: still dropped, but now visible in the job metrics
                // instead of disappearing silently.
                CommonMapReduce.setMetrics(context, "DMP", "platform_error", 1);
                return;
            }

            outKey.set(TAB_JOINER.join(
                    columns[0],
                    idType,
                    platform,
                    columns[2],
                    columns[3]
            ));
            context.write(outKey, NullWritable.get());
        }
    }

    /**
     * Configures and runs the {@code ods_ga_install_daily} job.
     *
     * <p>If the output path already exists it is deleted recursively before the job is
     * submitted. Output is gzip-compressed. No reducer class is set on the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path;
     *             any further arguments are ignored
     * @return 0 if the job completed successfully, 1 if it failed, 2 if fewer than two
     *         arguments were supplied
     * @throws Exception if job setup, file system access or job execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: " + GaInstallDailyMR.class.getSimpleName()
                    + " <input path> <output path>");
            ToolRunner.printGenericCommandUsage(System.err);
            return 2;
        }

        Configuration conf = getConf();
        conf.set("mapreduce.task.io.sort.mb", "500");
        conf.set("mapreduce.reduce.java.opts", "-Xmx1536m");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapreduce.reduce.shuffle.parallelcopies", "50");

        Job job = Job.getInstance(conf, "ods_ga_install_daily");
        job.setJarByClass(GaInstallDailyMR.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Remove any previous output so the job does not fail on an existing directory.
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = outputPath.getFileSystem(conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setMapperClass(GaInstallDailyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; delegates to {@link ToolRunner} and exits with the
     * status returned by {@link #run(String[])}.
     *
     * @param args command-line arguments, passed through to {@link ToolRunner}
     * @throws Exception if the tool run fails
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new GaInstallDailyMR(), args));
    }
}