//package mobvista.dmp.datasource.apptag.crawler_spark;
//
//public class CaiNiXiHuanBundleVisitor {
//}

package mobvista.dmp.datasource.apptag.crawler_spark;

import mobvista.dmp.util.HttpUtil;
import mobvista.dmp.util.MRUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Crawls the App Store page of an iOS app and extracts the names of related apps:
 * the "more apps by this developer" list and the "you might also like" list.
 *
 * <p>The flow implemented by {@link #visit(String)} is: iTunes lookup API (CN store,
 * then US store if the HTTP request fails) to resolve the app page URL, download of
 * that page, and extraction of the two related-app sections with jsoup.
 *
 * <p>{@link #notFound} is a plain {@code int} incremented without synchronization.
 *
 * Created by fl on 2017/7/12.
 */
public class CaiNiXiHuanBundleVisitor {
    /** iTunes lookup endpoint; arguments are the bundle id and the store country code. */
    private static final String LOOKUP_URL = "http://itunes.apple.com/lookup?bundleId=%s&country=%s";
    private static final String CN = "CN";
    private static final String US = "US";

    /** Headline of the "more apps by this developer" section on the zh-CN page. */
    private static final String COAUTHOR_HEAD_LABAL = "更多来自此开发人员的 App";
    /** Headline of the "you might also like" section on the zh-CN page. */
    private static final String MAY_LIKE_HEAD_LABAL = "你可能也会喜欢";
    /** Prefix for the site-relative "see all" links found on the app page. */
    private static final String APP_URL_ROOT = "https://apps.apple.com%s";
    /** Separator placed between app names inside one related-app list. */
    private static final String NAME_SEPARATOR = "\u0003";

    private final HttpHost httpHost;
    /** Number of lookups that did not resolve to a usable App Store page. */
    protected int notFound = 0;
    // Stored by the constructor; not read anywhere in this class.
    private final String date;

    private final ObjectMapper objectMapper;

    /**
     * @param httpHost host handed to every {@code HttpUtil.doGet} call made by this visitor
     * @param date     stored as-is; not used by this class
     */
    public CaiNiXiHuanBundleVisitor(HttpHost httpHost, String date) {
        this.httpHost = httpHost;
        this.date = date;
        this.objectMapper = new ObjectMapper();
    }

    /**
     * Resolves the App Store page of {@code packageName} and extracts its related apps.
     *
     * @param packageName bundle identifier; a leading {@code "id"} is stripped before lookup
     * @return the record produced by {@link #parse(String)}; {@code null} when the app or its
     *         page could not be found; an empty string when an unexpected error occurred
     *         (the error is printed and swallowed so one bad app does not abort a crawl)
     */
    public String visit(String packageName) throws IOException {
        String ret = "";
        try {
            // NOTE(review): the stripped value is still sent as bundleId=...; confirm that
            // numeric "id123" style inputs are really meant to go through the bundleId lookup.
            if (packageName.startsWith("id")) {
                packageName = packageName.substring(2);
            }

            // Try the CN store first and fall back to the US store on an HTTP-level failure.
            HttpUtil.HttpResult result = HttpUtil.doGet(String.format(LOOKUP_URL, packageName, CN), httpHost);
            if (!isOk(result)) {
                result = HttpUtil.doGet(String.format(LOOKUP_URL, packageName, US), httpHost);
                if (!isOk(result)) {
                    notFound++;
                    return null;
                }
            }

            String iosUrl = getPackageUrl(packageName, result.getContent());
            if (iosUrl == null) {
                // getPackageUrl has already counted the miss; do not issue a request for a null URL.
                return null;
            }

            result = HttpUtil.doGet(iosUrl, httpHost);
            if (!isOk(result)) {
                notFound++;
                return null;
            }
            ret = parse(result.getContent());
        } catch (Exception e) {
            // Deliberate best-effort: report the failure and return the empty default.
            e.printStackTrace();
        }
        return ret;
    }

    /**
     * Extracts the App Store page URL ({@code trackViewUrl} of the first result) from an
     * iTunes lookup response.
     *
     * @param packageName bundle identifier the response belongs to (not used in the lookup itself)
     * @param html        body of the lookup response, expected to be JSON
     * @return the page URL, or {@code null} when the response contains no usable result;
     *         every {@code null} return increments {@link #notFound} once
     * @throws IOException if the body cannot be parsed as JSON
     */
    public String getPackageUrl(String packageName, String html) throws IOException {
        JsonNode root = objectMapper.readTree(html);
        JsonNode countNode = (root == null) ? null : root.get("resultCount");
        JsonNode results = (root == null) ? null : root.get("results");

        JsonNode first = null;
        if (countNode != null && countNode.asInt() != 0 && results != null && results.size() > 0) {
            first = results.get(0);
        }
        JsonNode urlNode = (first == null) ? null : first.get("trackViewUrl");
        if (urlNode == null || urlNode.isNull()) {
            notFound++;
            return null;
        }
        return urlNode.asText();
    }

    /**
     * Parses an App Store app page and builds the output record.
     *
     * @param html markup of the app page
     * @return app title, developer's other apps and "you might also like" apps, joined with
     *         {@code MRUtils.JOINER}; each app list is separated by {@code \u0003} and is
     *         empty when the section is absent
     * @throws IOException if the page has no app title element, or a "see all" page request fails
     */
    public String parse(String html) throws IOException {
        Document mainContent = Jsoup.parse(html);

        Element title = mainContent.select("h1[class=product-header__title app-header__title]").first();
        if (title == null || title.childNodes().isEmpty()) {
            throw new IOException("App title element not found in App Store page");
        }
        // Only the first child node is taken, so anything nested after it inside the <h1> is excluded.
        String packageName = title.childNodes().get(0).toString();

        String coauthorRes = "";
        String mayLikeRes = "";
        Elements sections = mainContent.select("section[class=l-content-width section section--bordered]");
        for (Element section : sections) {
            String head = section.select("h2[class=section__headline]").text();
            if (COAUTHOR_HEAD_LABAL.equals(head)) {
                coauthorRes = collectRelatedApps(section);
            } else if (MAY_LIKE_HEAD_LABAL.equals(head)) {
                mayLikeRes = collectRelatedApps(section);
            }
        }

        return MRUtils.JOINER.join(
                packageName,
                coauthorRes,
                mayLikeRes
        );
    }

    /**
     * Downloads a "see all" page and returns the app names listed on it.
     *
     * @param url absolute URL of the "see all" page
     * @return app names separated by {@code \u0003}; an empty string when the page lists none;
     *         {@code null} when the request did not return HTTP 200 with a body
     */
    public String getMore(String url) throws IOException {
        HttpUtil.HttpResult result = HttpUtil.doGet(url, httpHost);
        if (!isOk(result)) {
            return null;
        }

        Document mainContent = Jsoup.parse(result.getContent());
        return joinAriaLabels(mainContent.select("div[class=we-lockup__title]"));
    }

    /**
     * Collects the app names of one related-app section: the full list behind the section's
     * "see all" link when it is available, otherwise the names shown inline on the app page.
     * Never returns {@code null}.
     */
    private String collectRelatedApps(Element section) throws IOException {
        String urlSuffix = section.select("a[class=link section__nav__see-all-link ember-view]").attr("href");

        // Without a "see all" link the formatted URL would be the bare store root, so skip the request.
        String names = StringUtils.isEmpty(urlSuffix)
                ? null
                : getMore(String.format(APP_URL_ROOT, urlSuffix));

        if (StringUtils.isEmpty(names)) {
            names = joinAriaLabels(section.select("div[class=we-lockup__title ]"));
        }
        return names;
    }

    /** Joins the {@code aria-label} attribute of every element with {@code \u0003}. */
    private static String joinAriaLabels(Elements titles) {
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (Element title : titles) {
            if (!first) {
                joined.append(NAME_SEPARATOR);
            }
            joined.append(title.attr("aria-label"));
            first = false;
        }
        return joined.toString();
    }

    /** Returns true when the response has status 200 and a non-null body. */
    private static boolean isOk(HttpUtil.HttpResult result) {
        return result != null && result.getStatusCode() == 200 && result.getContent() != null;
    }
}

