package mobvista.prd.datasource.test;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import mobvista.dmp.util.MRUtils;
import mobvista.prd.datasource.util.GsonUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.poi.ss.usermodel.*;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;

/**
 * Created by fl on 2017/6/13.
 *
 * <p>Scratch class holding a handful of unrelated experiments: a file-diff
 * {@code main}, a spreadsheet cell reader, a binary-search demo, a Google Play
 * detail-page parser, two small hashing/byte helpers and a local copy of a
 * reduce step that prints its output instead of writing it to a job context.
 */
public class Test {
    /** Scratch buffer; {@link #reduce} resets it once per input value. */
    StringBuilder builder = new StringBuilder();

    /** JSON serializer used by {@link #parse} and {@link #reduce}. */
    private ObjectMapper objectMapper = new ObjectMapper();

    /** Prefix of a Google Play detail-page URL; the package name is appended. */
    private final static String GOOGLE_URL = "https://play.google.com/store/apps/details?id=";

    /**
     * Prints the number of distinct first-column values in two tab-separated
     * files, followed by every value that occurs in {@code dim_dc_segemnt} but
     * not in {@code segment_tag}.
     *
     * @param args unused
     * @throws Exception if either file cannot be read
     */
    public static void main(String[] args) throws Exception {
        // Files are decoded with the platform default charset, as before.
        List<String> list = Files.readLines(new File("/Users/fl/Downloads/segment_tag"), Charset.defaultCharset());
        List<String> list1 = Files.readLines(new File("/Users/fl/Downloads/dim_dc_segemnt"), Charset.defaultCharset());

        Set<String> set1 = firstColumns(list1);
        Set<String> set = firstColumns(list);

        System.out.println(set.size());
        System.out.println(set1.size());
        for (String str : set1) {
            if (!set.contains(str)) {
                System.out.println(str);
            }
        }
    }

    /** Collects the text before the first tab of every line into a set. */
    private static Set<String> firstColumns(List<String> lines) {
        Set<String> firstFields = new HashSet<>();
        for (String line : lines) {
            firstFields.add(line.split("\t")[0]);
        }
        return firstFields;
    }

    /**
     * Returns the content of a spreadsheet cell as text.
     *
     * @param cell the cell to read; may be {@code null}
     * @return {@code ""} for a {@code null} cell; the numeric value truncated
     *         to a {@code long} when the cell type is {@code 0}; otherwise the
     *         cell's string value
     */
    public static String getCellValue(Cell cell) {
        if (cell != null) {
            // NOTE(review): 0 is presumably the numeric cell type of the POI
            // version on the classpath -- confirm before upgrading POI.
            if (cell.getCellType() == 0) {
                return String.valueOf((long) cell.getNumericCellValue());
            } else {
                return cell.getStringCellValue();
            }
        } else {
            return "";
        }
    }

    /**
     * Binary-search demo on a fixed sorted array: prints the index of the last
     * element equal to {@code key}, or {@code "no found"} when it is absent.
     */
    public void search() {
        int nums[] = {1, 2, 3, 4, 6, 6, 7, 8, 9, 10};
        int left = 0, mid = 0, right = nums.length - 1, key = 6;

        // Alternative kept for reference: finds the FIRST occurrence of key.
//        while (left < right) {
//            mid = left + ((right - left) >> 1);
//            if (nums[mid] < key) {
//                left = mid + 1;
//            } else {
//                right = mid;
//            }
//        }
//
//        if (nums[right] == key) {
//            System.out.println(right);
//        } else {
//            System.out.println("no found");
//        }

        while (left < right) {
            // Upper midpoint. It must be strictly greater than left whenever
            // left < right, otherwise "left = mid" makes no progress and the
            // loop never ends (the previous ">> 2" yielded mid == left for
            // ranges of two or three elements).
            mid = left + ((right - left + 1) >> 1);
            if (nums[mid] <= key) {
                left = mid;
            } else {
                right = mid - 1;
            }
        }

        if (nums[left] == key) {
            System.out.println(left);
        } else {
            System.out.println("no found");
        }
    }

    /**
     * Extracts app metadata from the HTML of a Google Play detail page and
     * joins it into one record with {@code MRUtils.JOINER}.
     *
     * <p>The joined fields are, in order: package name, app title, categories
     * as a JSON array of {@code {"code", "name"}} objects, review count,
     * download count and the detail-page URL. Review and download counts
     * default to {@code "0"} when their element is not present.
     *
     * @param packageName package name of the app; also used to build the URL
     * @param html        page markup, or {@code null}
     * @return the joined record, or {@code null} when {@code html} is {@code null}
     * @throws IOException if the category list cannot be serialized to JSON
     */
    public String parse(String packageName, String html) throws IOException {
        if (html == null) {
            return null;
        }

        // NOTE(review): the main-content, info-box-top, id-app-title and
        // metadata elements are dereferenced without a null check, so a page
        // lacking any of them will presumably fail with a NullPointerException.
        Element mainContent = Jsoup.parse(html).select("div[class=main-content]").first();
        Element infoBoxTop = mainContent.select("div[class=info-box-top]").first();

        String url = GOOGLE_URL + packageName;
        String appName = infoBoxTop.select("div[class=id-app-title]").first().text();

        Elements cateElements = infoBoxTop.select("a[class=document-subtitle category]");
        List<Map<String, String>> categoryList = Lists.newArrayList();
        for (Element category : cateElements) {
            String code = category.attr("href").replace("/store/apps/category/", "");
            String name = category.child(0).text();
            Map<String, String> map = Maps.newHashMap();
            map.put("code", code);
            map.put("name", name);
            categoryList.add(map);
        }

        // Attribute selectors need their closing bracket; it was missing here.
        Elements elements = mainContent.select("span[class=reviews-num]");
        String userRatingCount = elements.size() == 0 ? "0" : elements.first().text(); // user ratings

        Element otherInfoElement = mainContent.select("div[class=details-section metadata]").first();
        // Same fix: the closing bracket of the attribute selector was missing.
        elements = otherInfoElement.select("div[itemprop=numDownloads]");
        String download = elements.size() == 0 ? "0" : elements.first().text();

        return MRUtils.JOINER.join(
                packageName, appName, objectMapper.writeValueAsString(categoryList),
                userRatingCount, download, url
        );
    }

    /**
     * Folds the characters of {@code deviceId} into an {@code int} hash.
     *
     * <p>NOTE(review): because of the {@code +=}, each step computes
     * {@code hash = 132 * hash + c}, whereas the textbook BKDR hash is
     * {@code hash = 131 * hash + c}. Left unchanged on purpose, since existing
     * hash values may be relied upon -- confirm which form is intended.
     *
     * @param deviceId string to hash; must not be {@code null}
     * @return the hash value; {@code int} overflow wraps around
     */
    private static int BKDRHash(String deviceId) {
        char[] chars = deviceId.toCharArray();
        int hash = 0;
        for (char c : chars) {
            hash += 131 * hash + c;
        }
        return hash;
    }

    /**
     * Splits an {@code int} into its four bytes, least-significant byte first.
     *
     * @param i value to convert
     * @return a 4-element array where index 0 holds bits 0-7 and index 3 holds
     *         bits 24-31
     */
    public static byte[] intToBytes(int i) {
        byte[] value = new byte[4];
        value[3] = (byte) ((i & 0xff000000) >> 24);
        value[2] = (byte) ((i & 0x00ff0000) >> 16);
        value[1] = (byte) ((i & 0x0000ff00) >> 8);
        value[0] = (byte) ((i & 0x000000ff));
        return value;
    }

    /**
     * Local variant of a reduce step that prints its results to standard
     * output instead of writing them to a job context.
     *
     * <p>Each value is split with {@code MRUtils.SPLITTER}; values that do not
     * have exactly 16 fields are skipped. For every accepted value a detail
     * line (fields 14, 2, 11, 12, 13) is printed. After all values, one merged
     * line is printed: the key, the first 12 fields of the first accepted
     * value, with slot 9 replaced by the JSON set of field 8 values, slot 10
     * by the comma-joined non-empty field 9 values and slot 12 by the JSON set
     * of segment ids taken from field 15. Nothing is printed for the merged
     * line when no value was accepted.
     *
     * @param key    record key (device_id \t device_type)
     * @param values raw records for that key
     * @throws IOException          if JSON serialization fails
     * @throws InterruptedException declared for signature compatibility
     */
    protected void reduce(Text key, Set<Text> values) throws IOException, InterruptedException {
        Set<String> pkgSet = Sets.newHashSet();
        Set<Integer> segmentIdSet = Sets.newHashSet();
        String[] outFields = null; // new String[11];
        Set<String> androidId = Sets.newHashSet();

        for (Text value : values) {
            String line = value.toString();
            String[] array = MRUtils.SPLITTER.split(line, -1);
            if (array.length != 16) {
                continue;
            }

            // The first accepted record seeds the merged output row.
            if (outFields == null) {
                outFields = new String[13];
                outFields[0] = key.toString(); // device_id \t device_type
                for (int i = 1; i < 13; i++) {
                    outFields[i] = array[i - 1];
                }
            }

            pkgSet.add(array[8]);
            if (!array[9].isEmpty()) {
                androidId.add(array[9]);
            }

            builder.setLength(0);
//            builder.append(detailOutPath)
//                    .append(",") // separates the output path from the real output key: path left of the comma, key to the right
//                    .append(key.toString()) // device_id + "\t" + device_type
//                    .append("\t")
//                    .append(array[0]); // platform
//            outKey.set(builder.toString());
            System.out.println("outValue = " + MRUtils.JOINER.join(
                    array[14], // time
                    array[2],  // ip
                    array[11], // geo
                    array[12], // longitude
                    array[13]  // latitude
            ));

            // Only values starting with '[' are treated as a JSON array of segments.
            String segment = array[15];
            if (segment.startsWith("[")) {
                JsonArray segArray = GsonUtil.String2JsonArray(segment);
                for (JsonElement segElement : segArray) {
                    segmentIdSet.add(segElement.getAsJsonObject().get("id").getAsInt());
                }
            }
//            context.write(outKey, outValue);
        }

        if (outFields != null) {
            outFields[9] = objectMapper.writeValueAsString(pkgSet);
            outFields[10] = MRUtils.join(androidId, ",");
            outFields[12] = objectMapper.writeValueAsString(segmentIdSet);
//            outKey.set(outPath + ", ");
//            outValue.set(MRUtils.join(outFields, "\t"));
//            context.write(outKey, outValue);
            System.out.println("outValue = " + MRUtils.join(outFields, "\t"));
        }
    }
}