package mobvista.prd.datasource.table;

import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.JavaType;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by Administrator on 2017/3/28 0028.
 * desc: attach the URL and category of each pkg_name to the request records.
 *
 * <p>The mapper loads two package dictionaries (configuration keys {@code iosFile} and
 * {@code adrFile}) into memory and, for every package name found in an input record, emits
 * {@code fields[0]} as key and
 * {@code fields[1] \t packageName \t url \t category \t fields[2]} as value. Packages missing
 * from both dictionaries are emitted with empty url/category columns.
 */
public class AppUrlMR {

    /** Charset used to decode the dictionary files. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Counter group shared by all counters of this job. */
    private static final String COUNTER_GROUP = "DMP";

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args generic Hadoop options followed by {@code <input path> <output path>};
     *             the dictionary locations are read by the mapper from the configuration keys
     *             {@code iosFile} and {@code adrFile}, so they presumably arrive as {@code -D} options
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException below.
            System.err.println("Usage: AppUrlMR [generic options] <input path> <output path>");
            System.exit(2);
        }
        conf.set("mapreduce.reduce.memory.mb", "1472");

        // NOTE(review): the job name looks copied from another job; kept unchanged in case
        // external tooling matches on it.
        Job job = Job.getInstance(conf, "CalcDeviceAgeMR");
        // Locate the job jar through this class rather than an unrelated one.
        job.setJarByClass(AppUrlMR.class);

        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setMapperClass(MergeAppMapper.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Joins request records against the in-memory package dictionaries.
     *
     * <p>The record layout depends on the directory the record was read from:
     * paths containing {@code /etl_dsp_request_daily/} carry a JSON array of package names in
     * column 10, paths containing {@code /etl_adn_sdk_request_daily/} carry a single package
     * name in column 4.
     */
    public static class MergeAppMapper extends Mapper<LongWritable, Text, Text, Text> {
        Map<String, String> dspPackageMap = Maps.newHashMap();
        Map<String, String> iosPackageMap = Maps.newHashMap();
        Text outKey = new Text();
        Text outValue = new Text();
        ObjectMapper objectMapper = new ObjectMapper();
        JavaType listType = objectMapper.getTypeFactory().constructCollectionType(ArrayList.class, String.class);

        /**
         * Loads both dictionaries (the output of CalcPackageDictMR) into maps.
         */
        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            String iosPath = context.getConfiguration().get("iosFile");
            iosPackageMap = getAppUrl(iosPath, 0, 11, 6, context);

            String dspPath = context.getConfiguration().get("adrFile");
            dspPackageMap = getAppUrl(dspPath, 0, 5, 2, context);
        }

        /**
         * Emits one output record per package name found in the input line.
         *
         * <p>Records that cannot be processed are counted under {@code DMP / record error}
         * and skipped; interruption is propagated to the framework.
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String filePath = resolveInputPath(context);
                String[] fields = value.toString().split("\t", -1);

                if (filePath != null && filePath.contains("/etl_dsp_request_daily/")) { // dsp
                    // Column 10 holds a JSON array of package names.
                    List<String> packageNameList = objectMapper.readValue(fields[10], listType);
                    for (String packageName : packageNameList) {
                        emit(context, fields, packageName);
                    }
                } else if (filePath != null && filePath.contains("/etl_adn_sdk_request_daily/")) { // M system
                    emit(context, fields, fields[4]);
                } else {
                    // NOTE(review): despite its name this counter is hit when the source path is
                    // unknown or unrecognised; the name is kept for existing dashboards.
                    context.getCounter(COUNTER_GROUP, "fields.length error").increment(1L);
                }
            } catch (InterruptedException e) {
                // Never swallow interruption: the framework uses it to stop the task.
                throw e;
            } catch (Exception e) {
                // Best effort: a malformed record must not fail the task, but make it visible.
                context.getCounter(COUNTER_GROUP, "record error").increment(1L);
                e.printStackTrace();
            }
        }

        /**
         * Returns the path of the file the current record comes from, or {@code null} if unknown.
         *
         * <p>The {@code map.input.file} configuration value is preferred; when it is absent the
         * path of the current split is used if the split is a {@link FileSplit}.
         */
        private String resolveInputPath(Context context) {
            String configured = context.getConfiguration().get("map.input.file");
            if (configured != null) {
                return configured;
            }
            Object split = context.getInputSplit();
            if (split instanceof FileSplit) {
                return ((FileSplit) split).getPath().toString();
            }
            return null;
        }

        /**
         * Looks the package up (Android/dsp dictionary first, then iOS) and writes one record.
         * Unknown packages get two empty columns in place of url and category.
         */
        private void emit(Context context, String[] fields, String packageName)
                throws IOException, InterruptedException {
            String appInfo = dspPackageMap.get(packageName);
            if (appInfo == null) {
                appInfo = iosPackageMap.get(packageName);
            }
            if (appInfo == null) {
                appInfo = "\t"; // empty url + empty category
            }
            outKey.set(fields[0]);
            outValue.set(fields[1] + "\t" + packageName + "\t" + appInfo + "\t" + fields[2]);
            context.write(outKey, outValue);
        }
    }

    /**
     * Reads every file directly under {@code path} and builds a map from the value in column
     * {@code id} to {@code "<url>\t<category>"}.
     *
     * <p>If column 2 of a line contains the text {@code name}, it is parsed as a JSON array of
     * objects and the value becomes {@code column 5 + "\t" + name of the first object};
     * otherwise the value is {@code column url + "\t" + column category}. Lines with too few
     * columns are skipped and counted under {@code DMP / dict line skipped}.
     *
     * @param path     directory (or file) holding the tab-separated dictionary
     * @param id       index of the key column
     * @param url      index of the url column
     * @param category index of the category column
     * @param context  task context supplying the configuration and counters
     * @return map from key column to {@code url \t category}
     * @throws IOException              if the files cannot be listed, read or parsed
     * @throws IllegalArgumentException if {@code path} is {@code null}
     */
    public static Map<String, String> getAppUrl(String path, int id, int url, int category, Mapper.Context context) throws IOException {
        if (path == null) {
            throw new IllegalArgumentException(
                    "dictionary path is null; check that the corresponding configuration key is set");
        }

        ObjectMapper objectMapper = new ObjectMapper();
        JavaType listType = objectMapper.getTypeFactory().constructCollectionType(ArrayList.class, Map.class);
        Map<String, String> packageMap = Maps.newHashMap();

        // Not closed here: FileSystem.get may hand out a cached instance shared with the framework.
        FileSystem fileSystem = FileSystem.get(URI.create(path), context.getConfiguration());
        FileStatus[] statuses = fileSystem.listStatus(new Path(path));

        for (FileStatus status : statuses) {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fileSystem.open(status.getPath()), UTF_8));
            try {
                String readLine;
                while ((readLine = reader.readLine()) != null) {
                    String[] arr = readLine.split("\t", -1);
                    if (arr.length <= 2) {
                        context.getCounter(COUNTER_GROUP, "dict line skipped").increment(1L);
                        continue;
                    }
                    if (arr[2].contains("name")) {
                        // NOTE(review): columns 2 and 5 are hard-coded here instead of using
                        // `category` / `url`; confirm this is intended for both dictionaries.
                        if (arr.length <= Math.max(id, 5)) {
                            context.getCounter(COUNTER_GROUP, "dict line skipped").increment(1L);
                            continue;
                        }
                        // Convert the JSON array into a list of maps.
                        List<Map<String, String>> packageNameList = objectMapper.readValue(arr[2], listType);
                        packageMap.put(arr[id], arr[5] + "\t" + packageNameList.get(0).get("name"));
                    } else {
                        if (arr.length <= Math.max(id, Math.max(url, category))) {
                            context.getCounter(COUNTER_GROUP, "dict line skipped").increment(1L);
                            continue;
                        }
                        packageMap.put(arr[id], arr[url] + "\t" + arr[category]);
                    }
                }
            } finally {
                reader.close();
            }
        }
        return packageMap;
    }
}