package mobvista.dmp.datasource.apptag.category;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * author: houying
 * date  : 16-10-28
 * desc  : Scrapes the category tags of the iOS App Store.
 *
 * <p>Reads a saved HTML page from the local file system, selects the {@code div} matched by
 * {@code div[class=grid3-column]}, and flattens the nested lists of links inside it into
 * {@link Category} entries.
 */
public class IosAppStoreCategoryCrawler {

    /** Page location used when the caller does not supply one. */
    private static final String DEFAULT_HTML_PATH = "/home/houying/tmp/app_store.html";

    /** jsoup query selecting the element whose children are the category lists. */
    private static final String CATEGORY_CONTAINER_QUERY = "div[class=grid3-column]";

    /** Shared serializer; avoids building a new mapper on every {@link #run()} call. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Path of the HTML file that {@link #getCategoryPage()} reads. */
    private final String htmlPath;

    /** Creates a crawler that reads the default page location. */
    public IosAppStoreCategoryCrawler() {
        this(DEFAULT_HTML_PATH);
    }

    /**
     * Creates a crawler that reads the given HTML file.
     *
     * @param htmlPath path of the saved category page; must not be {@code null}
     * @throws IllegalArgumentException if {@code htmlPath} is {@code null}
     */
    public IosAppStoreCategoryCrawler(String htmlPath) {
        if (htmlPath == null) {
            throw new IllegalArgumentException("htmlPath must not be null");
        }
        this.htmlPath = htmlPath;
    }

    /**
     * Command-line entry point.
     *
     * @param args optional; when present, {@code args[0]} is the path of the HTML file to parse,
     *             otherwise the default location is used
     * @throws IOException if the page cannot be read, lacks the expected container, or the
     *                     result cannot be serialized
     */
    public static void main(String[] args) throws IOException {
        IosAppStoreCategoryCrawler crawler = args.length > 0
                ? new IosAppStoreCategoryCrawler(args[0])
                : new IosAppStoreCategoryCrawler();
        crawler.run();
    }

    /**
     * Parses the category page, prints the result as JSON on standard output and returns it.
     *
     * @return the parsed categories, in document order
     * @throws IOException if the page cannot be read, lacks the expected container, or the
     *                     result cannot be serialized
     */
    public List<Category> run() throws IOException {
        List<Category> list = parseCategory(getCategoryPage());
        System.out.println(MAPPER.writeValueAsString(list));
        return list;
    }

    /**
     * Walks {@code div > * > *} (written as {@code ul}/{@code li} below) and collects one
     * {@link Category} per link found.
     *
     * <p>Within each list item, a direct {@code a} child yields an entry with an empty second
     * argument and becomes the current category; a direct {@code ul} child yields one entry per
     * {@code a} it contains, tagged with the current category.
     *
     * @param div container element whose children are the category lists; must not be null
     * @return the collected entries, in document order
     */
    private List<Category> parseCategory(Element div) {
        List<Category> list = Lists.newArrayList();
        for (Element ul : div.children()) {
            for (Element li : ul.children()) {
                // Reset per list item. Stays null if a nested <ul> precedes the item's own <a>,
                // in which case the sub-entries are created with a null category.
                String category = null;
                for (Element tag : li.children()) {
                    String tagName = tag.tagName();
                    if ("a".equals(tagName)) {
                        category = tag.text();
                        list.add(new Category(category, "", tag.attr("href")));
                    } else if ("ul".equals(tagName)) {
                        for (Element a : tag.select("a")) {
                            list.add(new Category(category, a.text(), a.attr("href")));
                        }
                    }
                }
            }
        }
        return list;
    }

    /**
     * Loads the HTML file as UTF-8 and returns the first element matching the container query.
     *
     * @return the category container element, never {@code null}
     * @throws IOException if the file cannot be read or contains no matching element
     */
    private Element getCategoryPage() throws IOException {
        CharSource charSource = Files.asCharSource(new File(htmlPath), Charsets.UTF_8);
        String html = charSource.read();
        Document document = Jsoup.parse(html);
        Elements elements = document.select(CATEGORY_CONTAINER_QUERY);
        // Elements.first() returns null on an empty match; fail here with context instead of
        // letting parseCategory hit a bare NullPointerException.
        Element container = elements.first();
        if (container == null) {
            throw new IOException("No element matching '" + CATEGORY_CONTAINER_QUERY
                    + "' found in " + htmlPath);
        }
        return container;
    }
}
