package mobvista.dmp.datasource.apptag.category;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Crawler for the Google Play application category tags.
 *
 * <p>It parses a locally saved copy of the Google Play page and extracts the
 * category drop-down menu as a flat list of {@link Category} entries. It is
 * meant to be run once; it is not needed in day-to-day operation.
 *
 * author: houying
 * date  : 16-10-28
 */
public class GooglePlayCategoryCrawler {

    /** Location of the saved page that is used when no file is supplied. */
    private static final String DEFAULT_PAGE_PATH = "/home/houying/tmp/google_play.html";

    /** Prefix prepended to the (relative) hrefs found in the menu. */
    private static final String BASE_URL = "https://play.google.com";

    /** Locally saved Google Play HTML page to parse. */
    private final File pageFile;

    /**
     * Creates a crawler that reads the page from the default local path.
     */
    public GooglePlayCategoryCrawler() {
        this(new File(DEFAULT_PAGE_PATH));
    }

    /**
     * Creates a crawler that reads the page from the given file.
     *
     * @param pageFile locally saved Google Play HTML page; must not be null
     * @throws IllegalArgumentException if {@code pageFile} is null
     */
    public GooglePlayCategoryCrawler(File pageFile) {
        if (pageFile == null) {
            throw new IllegalArgumentException("pageFile must not be null");
        }
        this.pageFile = pageFile;
    }

    /**
     * Converts the menu element into a flat list of categories.
     *
     * @param element element whose first child is the {@code <ul>} holding one
     *                {@code <li>} per category group
     * @return all categories found, in document order
     * @throws IllegalStateException if {@code element} has no child elements
     */
    private List<Category> parseCategory(Element element) {
        Elements children = element.children();
        if (children.isEmpty()) {
            throw new IllegalStateException("Category menu element has no child elements");
        }
        Element ulElement = children.first();
        List<Category> list = Lists.newArrayList();
        for (Element li : ulElement.children()) {
            if ("0".equals(li.attr("jsinstance"))) {
                // This group only contains first-level categories.
                list.addAll(parseOnlyOneLevelCategory(li));
            } else {
                // The head link is a first-level category, the rest are second-level ones.
                list.addAll(parseTwoLevelCategory(li));
            }
        }
        return list;
    }

    /**
     * Parses a group made of one parent link followed by its child links.
     *
     * <p>Links whose class starts with {@code parent} yield a category with an
     * empty sub-category; links whose class starts with {@code child} yield the
     * most recently seen parent together with the child's text. Any other link
     * is skipped, so no entry with a null sub-category is produced.
     *
     * @param li the {@code <li>} element of the group
     * @return the categories of the group, in document order
     */
    private List<Category> parseTwoLevelCategory(Element li) {
        String category = null;
        List<Category> list = Lists.newArrayList();
        for (Element element : li.select("a")) {
            String cssClass = element.attr("class");
            String sub;
            if (cssClass.startsWith("parent")) {
                category = element.text();
                sub = "";
            } else if (cssClass.startsWith("child")) {
                sub = element.text();
            } else {
                // Neither a parent nor a child link: not a category entry.
                continue;
            }
            list.add(new Category(category, sub, buildUrl(element.attr("href"))));
        }
        return list;
    }

    /**
     * Parses a group that only contains first-level categories.
     *
     * @param li the {@code <li>} element of the group
     * @return one category (with an empty sub-category) per matching link
     */
    private List<Category> parseOnlyOneLevelCategory(Element li) {
        Elements aTagList = li.select("a[class=child-submenu-link]");
        List<Category> list = Lists.newArrayList();
        for (Element e : aTagList) {
            list.add(new Category(e.text(), "", buildUrl(e.attr("href"))));
        }
        return list;
    }

    /**
     * Prefixes the given path with the Google Play base URL.
     *
     * @param path href value taken from a menu link
     * @return the base URL followed by {@code path}
     */
    private String buildUrl(String path) {
        return BASE_URL + path;
    }

    /**
     * Reads the saved page and locates the category drop-down.
     *
     * @return the first child of the first drop-down container whose id starts
     *         with {@code action-dropdown-children-}
     * @throws IOException           if the page file cannot be read
     * @throws IllegalStateException if no suitable container is found in the page
     */
    private Element getCategoryPage() throws IOException {
        // The page is read from a local file rather than fetched over the network.
        CharSource charSource = Files.asCharSource(pageFile, Charsets.UTF_8);
        String html = charSource.read();
        Document document = Jsoup.parse(html);
        Elements elements = document.select("div[class=action-bar-dropdown-children-container]");
        for (Element element : elements) {
            String id = element.id();
            boolean isCategoryDropdown =
                    !Strings.isNullOrEmpty(id) && id.startsWith("action-dropdown-children-");
            // A container without children cannot hold the menu; keep looking.
            if (isCategoryDropdown && !element.children().isEmpty()) {
                return element.child(0);
            }
        }
        throw new IllegalStateException(
                "No non-empty div.action-bar-dropdown-children-container with id prefix "
                        + "'action-dropdown-children-' found in " + pageFile);
    }

    /**
     * Parses the saved page and returns every category found in it.
     *
     * @return the categories, in document order
     * @throws IOException if the page file cannot be read
     */
    public List<Category> run() throws IOException {
        return parseCategory(getCategoryPage());
    }

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} is the path of the saved page,
     *             otherwise the default path is used
     * @throws IOException if the page file cannot be read
     */
    public static void main(String[] args) throws IOException {
        GooglePlayCategoryCrawler crawler = args.length > 0
                ? new GooglePlayCategoryCrawler(new File(args[0]))
                : new GooglePlayCategoryCrawler();
        crawler.run();
    }
}
