package org.apdplat.extractor.html.impl;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apdplat.extractor.html.HtmlExtractor;
import org.apdplat.extractor.html.model.CssPath;
import org.apdplat.extractor.html.model.ExtractFailLog;
import org.apdplat.extractor.html.model.ExtractFunction;
import org.apdplat.extractor.html.model.ExtractResult;
import org.apdplat.extractor.html.model.ExtractResultItem;
import org.apdplat.extractor.html.model.HtmlTemplate;
import org.apdplat.extractor.html.model.UrlPattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apdplat/extractor/html/impl/DefaultHtmlExtractor.class */
/**
 * Default {@link HtmlExtractor} implementation.
 *
 * <p>For a given URL it asks the configured {@link ExtractRegular} for the matching
 * {@link HtmlTemplate}s, parses the page with jsoup, and applies every template's
 * {@link CssPath}s (plus their optional {@link ExtractFunction}s) to produce one
 * {@link ExtractResult} per template.
 *
 * <p>The {@code usage*} methods and {@link #main(String[])} are runnable demos that fetch
 * live pages and print the extraction outcome to standard output.
 */
public class DefaultHtmlExtractor implements HtmlExtractor {
    private static final Logger LOGGER = LoggerFactory.getLogger(DefaultHtmlExtractor.class);

    /** Supplies the templates to apply for a URL. */
    private final ExtractRegular extractRegular;

    /**
     * Creates an extractor backed by the given template registry.
     *
     * @param extractRegular source of the templates looked up in {@link #extract(String, String)}
     */
    public DefaultHtmlExtractor(ExtractRegular extractRegular) {
        this.extractRegular = extractRegular;
    }

    /**
     * Extracts structured data from a page using every template registered for its URL.
     *
     * <p>A template that yields neither result items nor fail logs is skipped. Every other
     * result is enriched with the page content (UTF-8 bytes), the encoding name, and the
     * values of the page's {@code keywords} / {@code description} meta tags (empty strings
     * when absent) before being added to the returned list.
     *
     * <p>Exceptions raised while parsing the page or while applying a template are logged
     * and not propagated; a failure in one template does not prevent the remaining
     * templates from being applied. The template lookup itself is not guarded.
     *
     * @param str  the URL of the page, used for template lookup and recorded on each result
     * @param str2 the HTML source of the page
     * @return the extraction results, possibly empty, never {@code null}
     */
    @Override
    public List<ExtractResult> extract(String str, String str2) {
        List<ExtractResult> results = new ArrayList<>();
        List<HtmlTemplate> templates = this.extractRegular.getHtmlTemplate(str);
        if (templates.isEmpty()) {
            LOGGER.debug("没有为此URL指定模板：{}", str);
            return results;
        }
        try {
            Document document = Jsoup.parse(str2);

            // Page-level metadata is shared by every result produced for this page.
            // If a meta name occurs more than once, the last occurrence wins.
            String keywords = "";
            String description = "";
            for (Element meta : document.select("meta")) {
                String name = meta.attr("name");
                if ("keywords".equals(name)) {
                    keywords = meta.attr("content");
                } else if ("description".equals(name)) {
                    description = meta.attr("content");
                }
            }

            Set<String> seenTableNames = new HashSet<>();
            for (HtmlTemplate template : templates) {
                // Set.add returns false when the table name was already used by an earlier template.
                if (!seenTableNames.add(template.getTableName())) {
                    LOGGER.debug("多个模板定义的tableName重复，这有可能会导致数据丢失，检查UrlPattern下定义的模板：{}",
                            template.getUrlPattern().getUrlPattern());
                    LOGGER.debug("{}", templates);
                }
                try {
                    ExtractResult result = extractHtmlTemplate(str, template, document);
                    if (result.getExtractFailLogs().isEmpty() && result.getExtractResultItems().isEmpty()) {
                        LOGGER.debug("{} 的模板 {} 未抽取到", str, template.getTemplateName());
                        continue;
                    }
                    // Encoded per result so each result owns an independent byte array.
                    result.setContent(str2.getBytes(StandardCharsets.UTF_8));
                    result.setEncoding("utf-8");
                    result.setKeywords(keywords);
                    result.setDescription(description);
                    results.add(result);
                } catch (Exception e) {
                    // Isolate the failure so the remaining templates are still applied.
                    LOGGER.error("页面模板抽取失败：" + template.getTemplateName(), e);
                }
            }
        } catch (Exception e) {
            LOGGER.error("抽取网页出错: " + str, e);
        }
        return results;
    }

    /**
     * Applies a single template to an already parsed page.
     *
     * <p>Every element matched by a CSS path contributes either its text or, when the path
     * names an attribute, that attribute's value. Extraction stops at the first blank value
     * or the first extract function that returns {@code null}: a fail log is attached and
     * the result is returned immediately, keeping whatever items were collected before.
     *
     * @param url          the page URL recorded on the result and on fail logs
     * @param htmlTemplate the template whose CSS paths are applied
     * @param document     the parsed page
     * @return the result for this template, never {@code null}
     */
    private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document document) {
        ExtractResult extractResult = new ExtractResult();
        extractResult.setUrl(url);
        extractResult.setTableName(htmlTemplate.getTableName());
        for (CssPath cssPath : htmlTemplate.getCssPaths()) {
            for (Element element : document.select(cssPath.getCssPath())) {
                String text = StringUtils.isBlank(cssPath.getAttr())
                        ? element.text()
                        : element.attr(cssPath.getAttr());
                if (StringUtils.isBlank(text)) {
                    ExtractFailLog failLog = newFailLog(url, htmlTemplate, cssPath);
                    failLog.setExtractExpression("");
                    failLog.setFieldName(cssPath.getFieldName());
                    failLog.setFieldDescription(cssPath.getFieldDescription());
                    extractResult.addExtractFailLog(failLog);
                    return extractResult;
                }
                if (!cssPath.hasExtractFunction()) {
                    ExtractResultItem item = new ExtractResultItem();
                    item.setField(cssPath.getFieldName());
                    item.setValue(text);
                    extractResult.addExtractResultItem(item);
                    continue;
                }
                for (ExtractFunction extractFunction : cssPath.getExtractFunctions()) {
                    // NOTE(review): text is reassigned, so each function receives the previous
                    // function's output rather than the raw element value - confirm this is intended.
                    text = ExtractFunctionExecutor.execute(text, document, cssPath, extractFunction.getExtractExpression());
                    if (text == null) {
                        ExtractFailLog failLog = newFailLog(url, htmlTemplate, cssPath);
                        failLog.setExtractExpression(extractFunction.getExtractExpression());
                        failLog.setFieldName(extractFunction.getFieldName());
                        failLog.setFieldDescription(extractFunction.getFieldDescription());
                        extractResult.addExtractFailLog(failLog);
                        return extractResult;
                    }
                    ExtractResultItem item = new ExtractResultItem();
                    item.setField(extractFunction.getFieldName());
                    item.setValue(text);
                    extractResult.addExtractResultItem(item);
                }
            }
        }
        return extractResult;
    }

    /**
     * Builds a fail log pre-filled with the fields common to every failure: URL, URL pattern,
     * template name, CSS path and table name. The caller sets the extract expression, field
     * name and field description.
     */
    private static ExtractFailLog newFailLog(String url, HtmlTemplate htmlTemplate, CssPath cssPath) {
        ExtractFailLog failLog = new ExtractFailLog();
        failLog.setUrl(url);
        failLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
        failLog.setTemplateName(htmlTemplate.getTemplateName());
        failLog.setCssPath(cssPath.getCssPath());
        failLog.setTableName(htmlTemplate.getTableName());
        return failLog;
    }

    /** Demo: extracts title and body from a NetEase finance article using in-code templates. */
    private static void usage1() {
        List<UrlPattern> urlPatterns = new ArrayList<>();
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html");
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("网易财经频道");
        htmlTemplate.setTableName("finance");
        urlPattern.addHtmlTemplate(htmlTemplate);
        CssPath titlePath = new CssPath();
        titlePath.setCssPath("h1");
        titlePath.setFieldName("title");
        titlePath.setFieldDescription("标题");
        htmlTemplate.addCssPath(titlePath);
        CssPath contentPath = new CssPath();
        contentPath.setCssPath("div#endText");
        contentPath.setFieldName("content");
        contentPath.setFieldDescription("正文");
        htmlTemplate.addCssPath(contentPath);
        urlPatterns.add(urlPattern);
        extractAndPrint(ExtractRegular.getInstance(urlPatterns),
                "http://money.163.com/08/1219/16/4THR2TMP002533QK.html");
    }

    /**
     * Demo: same page as {@link #usage1()}, but the {@link ExtractRegular} is obtained from
     * a local web endpoint and a host/port pair instead of in-code templates.
     */
    private static void usage2() {
        extractAndPrint(
                ExtractRegular.getInstance("http://localhost:8080/html-extractor-web/api/all_extract_regular.jsp", "localhost", 6379),
                "http://money.163.com/08/1219/16/4THR2TMP002533QK.html");
    }

    /** Demo: extracts name, link and price of every product on a JD listing page. */
    private static void usage3() {
        List<UrlPattern> urlPatterns = new ArrayList<>();
        UrlPattern urlPattern = new UrlPattern();
        urlPattern.setUrlPattern("http://list.jd.com/list.html\\?cat=([\\d,]+)");
        HtmlTemplate htmlTemplate = new HtmlTemplate();
        htmlTemplate.setTemplateName("京东商品");
        htmlTemplate.setTableName("jd_goods");
        urlPattern.addHtmlTemplate(htmlTemplate);
        CssPath namePath = new CssPath();
        namePath.setCssPath("html body div div div ul li div div.p-name");
        namePath.setFieldName("name");
        namePath.setFieldDescription("名称");
        htmlTemplate.addCssPath(namePath);
        CssPath linkPath = new CssPath();
        linkPath.setCssPath("html body div div div ul li div div.p-name a");
        linkPath.setAttr("href");
        linkPath.setFieldName("link");
        linkPath.setFieldDescription("链接");
        htmlTemplate.addCssPath(linkPath);
        CssPath pricePath = new CssPath();
        pricePath.setCssPath("html body div div div ul li div div.p-price strong");
        pricePath.setFieldName("price");
        pricePath.setFieldDescription("价格");
        htmlTemplate.addCssPath(pricePath);
        urlPatterns.add(urlPattern);
        extractAndPrint(ExtractRegular.getInstance(urlPatterns),
                "http://list.jd.com/list.html?cat=9987,653,655");
    }

    /** Fetches {@code url}, runs an extractor built on {@code extractRegular}, and prints the results. */
    private static void extractAndPrint(ExtractRegular extractRegular, String url) {
        DefaultHtmlExtractor extractor = new DefaultHtmlExtractor(extractRegular);
        printResults(extractor.extract(url, new JSoupHtmlFetcher().fetch(url)));
    }

    /** Prints each result to standard output, numbered from 1. */
    private static void printResults(List<ExtractResult> results) {
        int index = 1;
        for (ExtractResult result : results) {
            System.out.println((index++) + "、网页 " + result.getUrl() + " 的抽取结果");
            if (result.isSuccess()) {
                printExtractedFields(result);
            } else {
                printFailLogs(result);
            }
        }
    }

    /** Prints every extracted field (multi-valued fields are numbered), then description and keywords. */
    private static void printExtractedFields(ExtractResult result) {
        for (Map.Entry<String, List<ExtractResultItem>> entry : result.getExtractResultItems().entrySet()) {
            String field = entry.getKey();
            List<ExtractResultItem> items = entry.getValue();
            if (items == null || items.isEmpty()) {
                // Nothing to print; also guards the get(0) below.
                continue;
            }
            if (items.size() > 1) {
                System.out.println("\t多值字段:" + field);
                int position = 1;
                for (ExtractResultItem item : items) {
                    System.out.println("\t\t" + (position++) + "、" + field + " = " + item.getValue());
                }
            } else {
                System.out.println("\t" + field + " = " + items.get(0).getValue());
            }
        }
        System.out.println("\tdescription = " + result.getDescription());
        System.out.println("\tkeywords = " + result.getKeywords());
    }

    /** Prints the details of every fail log attached to a result. */
    private static void printFailLogs(ExtractResult result) {
        System.out.println("抽取失败：");
        for (ExtractFailLog failLog : result.getExtractFailLogs()) {
            System.out.println("\turl:" + failLog.getUrl());
            System.out.println("\turlPattern:" + failLog.getUrlPattern());
            System.out.println("\ttemplateName:" + failLog.getTemplateName());
            System.out.println("\tfieldName:" + failLog.getFieldName());
            System.out.println("\tfieldDescription:" + failLog.getFieldDescription());
            System.out.println("\tcssPath:" + failLog.getCssPath());
            if (failLog.getExtractExpression() != null) {
                System.out.println("\textractExpression:" + failLog.getExtractExpression());
            }
        }
    }

    /**
     * Runs the JD listing demo ({@link #usage3()}); requires network access.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        usage3();
    }
}
