//package mobvista.dmp.datasource.apptag.crawler_spark;
//
//public class CaiNiXiHuanIosDetailVisitor {
//}

package mobvista.dmp.datasource.apptag.crawler_spark;

import mobvista.dmp.util.HttpUtil;
import mobvista.dmp.util.MRUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.net.URI;

/**
 * Resolves an iOS app id to its App Store page and extracts the apps listed in the
 * "More By This Developer" and "You May Also Like" sections of that page.
 *
 * <p>The store URL is looked up through the iTunes lookup endpoint, first for the CN
 * storefront and then for the US storefront; if neither yields a URL, the first link of a
 * Google search result page is used instead.
 *
 * <p>Instances keep a mutable {@link #notFound} counter and are therefore stateful.
 *
 * author: houying
 * date  : 16-11-3
 */
public class CaiNiXiHuanIosDetailVisitor {

    private static final String LOOKUP_URL = "https://itunes.apple.com/lookup?id=%s&country=%s";
    private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=id";
    private static final String CN = "CN";
    private static final String US = "US";
    /** Separator placed between the individual apps of one section. */
    private static final String ITEM_SEPARATOR = "\u0003";

    private static final String COAUTHOR_HEAD_LABEL_CN = "更多来自此开发人员的 App";
    private static final String MAY_LIKE_HEAD_LABEL_CN = "你可能也会喜欢";
    private static final String COAUTHOR_HEAD_LABEL_US = "More By This Developer";
    private static final String MAY_LIKE_HEAD_LABEL_US = "You May Also Like";
    private static final String APP_URL_ROOT = "https://apps.apple.com%s";

    private final HttpHost httpHost;
    private final String date;
    private final ObjectMapper objectMapper;
    /** Incremented whenever a lookup reports no result or a required HTTP request fails. */
    protected int notFound = 0;


    /**
     * @param httpHost host handed to every {@code HttpUtil.doGet} call
     * @param date     stored on the instance; not read by any method in this class
     */
    public CaiNiXiHuanIosDetailVisitor(HttpHost httpHost, String date) {
        this.httpHost = httpHost;
        this.objectMapper = new ObjectMapper();
        this.date = date;
    }

    /**
     * Crawls the store page of one app and returns the parsed record followed by a status
     * suffix of the form {@code "\u0004" + code}.
     *
     * <p>Status codes: {@code 0} success, {@code 1} both lookup requests failed,
     * {@code 2} no store URL from lookup or Google, {@code 3} the store page could not be
     * fetched, {@code 4} an exception was thrown. For every code other than {@code 0} the
     * record part is empty.
     *
     * @param packageName numeric app id, optionally prefixed with {@code "id"}
     * @return the record produced by {@link #parse} plus the status suffix
     */
    public String visit(String packageName)  {
        String ret = "";
        try {
            if (packageName.startsWith("id")) {
                packageName = packageName.substring(2);
            }
            String url = String.format(LOOKUP_URL, packageName, CN);

            String country = CN;
            HttpUtil.HttpResult result = HttpUtil.doGet(url, httpHost);
            // Parse the body only after the response is known to be usable: a null or
            // non-JSON error body must lead to the US fallback, not to an exception.
            String iosUrl = isOk(result) ? getPackageUrl(result.getContent()) : null;
            if (iosUrl == null) {
                url = String.format(LOOKUP_URL, packageName, US);
                result = HttpUtil.doGet(url, httpHost);
                if (!isOk(result)) {
                    System.out.println(packageName+" json error: "+url);
                    System.out.println(result.getContent());
                    notFound++;
                    return "\u00041";
                }
                country = US;
                iosUrl = getPackageUrl(result.getContent());
            }
            if (iosUrl == null) {
                String googleUrl = GOOGLE_SEARCH_URL + packageName;
                HttpUtil.HttpResult googleRes = HttpUtil.doGet(googleUrl, httpHost);
                iosUrl = getTrackViewUrlByGoogle(googleRes.getContent());
                country = US;
                if (iosUrl == null) {
                    System.out.println(packageName+" no trackViewUrl from lookup and google");
                    return "\u00042";
                }
            }

            // Genres come from the last lookup response, so this is null when the URL
            // had to be taken from Google.
            String categories = getCategory(result.getContent());

            result = HttpUtil.doGet(iosUrl, httpHost);
            if (!isOk(result)) {
                System.out.println(packageName+" url error: "+iosUrl);
                System.out.println(result.getContent());
                notFound++;
                return "\u00043";
            }
            ret = parse(result.getContent(), categories, country, packageName);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println(packageName+" exception");
            return ret+"\u00044";
        }
        System.out.println(packageName+" succeed");
        return ret+"\u00040";
    }

    /** Tells whether a response has status 200 and a non-null body. */
    private static boolean isOk(HttpUtil.HttpResult result) {
        return result.getStatusCode() == 200 && result.getContent() != null;
    }

    /**
     * Extracts the {@code href} of the first link inside {@code div#search} of a search
     * result page.
     *
     * @param html search result page, may be {@code null}
     * @return the link target, or {@code null} when the page is missing, contains no such
     *         link, or the link has an empty {@code href}
     */
    public String getTrackViewUrlByGoogle(String html) {
        if (html == null) {
            return null;
        }
        Document mainContent = Jsoup.parse(html);
        // first() yields null when the selection is empty.
        Element firstLink = mainContent.select("div[id=search]").select("a[href]").first();
        if (firstLink == null) {
            return null;
        }
        String href = firstLink.attr("href");
        return StringUtils.isEmpty(href) ? null : href;
    }

    /**
     * Reads {@code trackViewUrl} of the first entry of a lookup response.
     *
     * @param html JSON body of the lookup response
     * @return the URL, or {@code null} when the response holds no result or the field is
     *         absent; {@link #notFound} is incremented when {@code resultCount} is 0 or missing
     * @throws IOException if the body is not valid JSON
     */
    public String getPackageUrl(String html) throws IOException {
        JsonNode app = firstResult(html);
        if (app == null) {
            return null;
        }
        JsonNode trackViewUrl = app.get("trackViewUrl");
        if (trackViewUrl == null || trackViewUrl.isNull()) {
            return null;
        }
        return trackViewUrl.asText();
    }

    /**
     * Reads {@code genres} of the first entry of a lookup response.
     *
     * @param html JSON body of the lookup response
     * @return the JSON text of the {@code genres} node, or {@code null} when the response
     *         holds no result or the field is absent; {@link #notFound} is incremented when
     *         {@code resultCount} is 0 or missing
     * @throws IOException if the body is not valid JSON
     */
    public String getCategory(String html) throws IOException {
        JsonNode app = firstResult(html);
        if (app == null) {
            return null;
        }
        JsonNode genreNode = app.get("genres");
        return genreNode == null ? null : genreNode.toString();
    }

    /**
     * Returns the first element of {@code results} in a lookup response, or {@code null}
     * when there is none. Counts the call in {@link #notFound} when {@code resultCount}
     * is 0 or missing.
     */
    private JsonNode firstResult(String json) throws IOException {
        JsonNode root = json == null ? null : objectMapper.readTree(json);
        JsonNode countNode = root == null ? null : root.get("resultCount");
        if (countNode == null || countNode.asInt() == 0) {
            notFound++;
            return null;
        }
        JsonNode results = root.get("results");
        if (results == null || results.size() == 0) {
            return null;
        }
        return results.get(0);
    }

    /**
     * Parses a store page and joins the original package name, the same-developer apps,
     * the "may also like" apps and the category with {@code MRUtils.JOINER}.
     *
     * @param html         store page markup
     * @param category     category text passed through unchanged
     * @param country      {@code "US"} selects the English section headlines; any other
     *                     value (including {@code null}) selects the Chinese ones
     * @param orignPkgName package name passed through unchanged
     * @return the joined record
     * @throws IOException declared for compatibility with existing callers
     */
    public String parse(String html, String category, String country, String orignPkgName) throws IOException {
        Document mainContent = Jsoup.parse(html);

        Elements sections = mainContent.select("section[class=l-content-width section section--bordered]");

        boolean usStore = US.equals(country);
        String coauthorLabel = usStore ? COAUTHOR_HEAD_LABEL_US : COAUTHOR_HEAD_LABEL_CN;
        String mayLikeLabel = usStore ? MAY_LIKE_HEAD_LABEL_US : MAY_LIKE_HEAD_LABEL_CN;

        String coauthorRes = "";
        String mayLikeRes = "";

        for (Element section : sections) {
            String head = section.select("h2[class=section__headline]").text();
            if (coauthorLabel.equals(head)) {
                coauthorRes = collectApps(section);
            } else if (mayLikeLabel.equals(head)) {
                mayLikeRes = collectApps(section);
            }
        }
        return MRUtils.JOINER.join(
                orignPkgName,
                coauthorRes,
                mayLikeRes,
                category
        );
    }

    /**
     * Lists the apps of one page section: app ids taken from the links when available,
     * otherwise the {@code aria-label} of every lockup title, joined with
     * {@link #ITEM_SEPARATOR}.
     */
    private String collectApps(Element section) throws IOException {
        String ids = getMore(section);
        if (!StringUtils.isEmpty(ids)) {
            return ids;
        }
        StringBuilder labels = new StringBuilder();
        boolean first = true;
        for (Element title : section.select("div[class=we-lockup__title ]")) {
            // Separator goes between items only, so no leading separator is ever left over.
            if (!first) {
                labels.append(ITEM_SEPARATOR);
            }
            labels.append(title.attr("aria-label"));
            first = false;
        }
        return labels.toString();
    }

    /**
     * Collects the app ids referenced by the {@code a[data-metrics-click]} links below
     * {@code root}. Only links whose last path segment starts with {@code "id"} are used;
     * the prefix is removed.
     *
     * @param root element to search in
     * @return the ids joined with {@code "\u0003"}, or an empty string when there are none
     * @throws IOException declared for compatibility with existing callers
     */
    public String getMore(Element root) throws IOException {
        Elements links = root.select("a[data-metrics-click]");

        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (Element link : links) {
            String[] segments = link.attr("href").split("/");
            // split() drops trailing empty strings, so "/" produces an empty array.
            if (segments.length == 0) {
                continue;
            }
            String iosPkg = segments[segments.length - 1];
            if (!iosPkg.startsWith("id")) {
                continue;
            }
            if (!first) {
                sb.append(ITEM_SEPARATOR);
            }
            sb.append(iosPkg.substring(2));
            first = false;
        }
        return sb.toString();
    }

    /** Manual smoke run against a fixed app id. */
    public static void main(String[] args){
        CaiNiXiHuanIosDetailVisitor iosDetailVisitor = new CaiNiXiHuanIosDetailVisitor(null, "");
        String res = iosDetailVisitor.visit("1005391121");
        System.out.println(res);
    }
}

