package com.casm.acled.crawler.scraper;

import com.casm.acled.crawler.ScraperNotFoundException;
import com.casm.acled.crawler.util.Util;
import com.casm.acled.entities.source.Source;
import com.norconex.importer.doc.ImporterMetadata;
import com.norconex.importer.handler.ImporterHandlerException;
import com.norconex.importer.handler.tagger.impl.DOMTagger;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.susx.tag.norconex.scraping.GeneralSplitterFactory;
import uk.ac.susx.tag.norconex.scraping.POJOHTMLMatcherDefinition;

/* loaded from: input_file:com/casm/acled/crawler/scraper/ACLEDTagger.class */
/**
 * Builds a Norconex {@link DOMTagger} for a source from its scraper definition.
 *
 * <p>The definition is read from a {@code job.json} file and converted into a map from field key
 * (for example {@link #ARTICLE}) to a list of tag descriptors. Each descriptor may carry the keys
 * {@code "tag"}, {@code "custom"} and {@code "class"}, which are concatenated into a selector
 * string. A {@link Source} may override the rule for an individual field by supplying a selector
 * directly in one of its {@code SCRAPER_RULE_*} values.
 */
public class ACLEDTagger {
    protected static final Logger logger = LoggerFactory.getLogger(ACLEDTagger.class);

    /** Document text used by the manual test helpers; populated by {@link #setXML()}. */
    String xmlstr;

    public static final String JOB_JSON = "job.json";
    public static final String ARTICLE = "field.name/article";
    public static final String TITLE = "field.name/title";
    public static final String DATE = "field.name/date";

    /** Key of the scraper definition entry that describes the common root element. */
    private static final String ROOT = "root/root";

    /** Defaults used by the manual test harness when no explicit locations are supplied. */
    private static final String DEFAULT_SCRAPER_DIR =
            "/Users/pengqiwei/Downloads/My/PhDs/acled_thing/acled-scrapers/24chasabg";
    private static final String DEFAULT_TEST_HTML =
            "/Users/pengqiwei/Downloads/My/PhDs/acled_thing/test.html";

    private final Source source;
    private final Path scraperPath;

    /**
     * Creates a tagger for the given source.
     *
     * <p>The scraper directory is the source's {@code CRAWL_SCRAPER_PATH} value when it has one,
     * otherwise {@code path} resolved against the source id.
     *
     * @param path   base directory that holds one scraper directory per source
     * @param source source whose scraper definition should be loaded
     * @throws ScraperNotFoundException if the resolved {@code job.json} does not exist
     */
    public ACLEDTagger(Path path, Source source) {
        Path scraperDir = source.hasValue("CRAWL_SCRAPER_PATH")
                ? Paths.get((String) source.get("CRAWL_SCRAPER_PATH"))
                : path.resolve(Util.getID(source));
        this.scraperPath = scraperDir.resolve(JOB_JSON);
        this.source = source;
        if (Files.notExists(this.scraperPath)) {
            throw new ScraperNotFoundException(this.scraperPath + " doesn't exist");
        }
    }

    /**
     * Creates a tagger that is not bound to a {@link Source}; every rule is then taken from the
     * {@code job.json} inside the given directory. The file's existence is not checked here.
     *
     * @param str path of the scraper directory
     */
    public ACLEDTagger(String str) {
        this.scraperPath = Paths.get(str).resolve(JOB_JSON);
        this.source = null;
    }

    /**
     * Converts a list of matcher definitions into a map keyed by field name.
     *
     * @param list matcher definitions parsed from a scraper file
     * @return map from each definition's {@code field} to its tag definitions
     */
    public static Map<String, List<Map<String, String>>> buildScraperDefinition(List<POJOHTMLMatcherDefinition> list) {
        Map<String, List<Map<String, String>>> definitions = new HashMap<>();
        for (POJOHTMLMatcherDefinition matcher : list) {
            definitions.put(matcher.field, matcher.getTagDefinitions());
        }
        return definitions;
    }

    /** Reads and parses the scraper definition file, wrapping I/O failures in a runtime exception. */
    private Map<String, List<Map<String, String>>> getDef() {
        try {
            return buildScraperDefinition(
                    GeneralSplitterFactory.parseJsonTagSet(Util.processJSON(this.scraperPath.toFile())));
        } catch (IOException e) {
            throw new RuntimeException("Failed to read scraper definition " + this.scraperPath, e);
        }
    }

    /**
     * Registers the extraction rule for one field, preferring a selector stored on the source
     * over the one derived from the scraper definition.
     */
    private void loadRule(Source source, Map<String, List<Map<String, String>>> definition, DOMTagger tagger,
                          String sourceRuleKey, String fieldKey, String targetField) {
        // The String-only constructor leaves the source null; fall back to the definition then.
        if (source != null && source.hasValue(sourceRuleKey)) {
            addDOMDetailsSingleFromQuery((String) source.get(sourceRuleKey), fieldKey, targetField, tagger);
        } else {
            addDOMDetailsSingle(definition, fieldKey, targetField, tagger);
        }
    }

    /**
     * Builds a {@link DOMTagger} with article, title and date extraction rules.
     *
     * @return the configured tagger
     * @throws RuntimeException if the scraper definition cannot be read
     * @throws IllegalArgumentException if a field has neither a source rule nor a definition entry
     */
    public DOMTagger get() {
        DOMTagger tagger = new DOMTagger();
        Map<String, List<Map<String, String>>> definition = getDef();
        loadRule(this.source, definition, tagger, "SCRAPER_RULE_ARTICLE", ARTICLE, ScraperFields.SCRAPED_ARTICLE);
        loadRule(this.source, definition, tagger, "SCRAPER_RULE_TITLE", TITLE, ScraperFields.SCRAPED_TITLE);
        loadRule(this.source, definition, tagger, "SCRAPER_RULE_DATE", DATE, ScraperFields.SCRAPED_DATE);
        return tagger;
    }

    /**
     * Builds the selector for the {@code root/root} entry of a scraper definition.
     *
     * @param map scraper definition
     * @return the root selector, or an empty string when the definition has no root entry
     */
    public static String constructRoot(Map<String, List<Map<String, String>>> map) {
        List<Map<String, String>> rootDefinitions = map.get(ROOT);
        return rootDefinitions == null ? "" : buildSelector(rootDefinitions);
    }

    /**
     * Registers extraction rules for every article, title and date entry in the definition and
     * switches the tagger to the {@code "xml"} parser.
     *
     * @param map       scraper definition
     * @param dOMTagger tagger that receives the rules
     */
    public void addDOMDetailsAll(Map<String, List<Map<String, String>>> map, DOMTagger dOMTagger) {
        dOMTagger.setParser("xml");
        String rootPrefix = rootPrefix(map);
        for (Map.Entry<String, List<Map<String, String>>> entry : map.entrySet()) {
            String targetField = targetFieldFor(entry.getKey());
            if (targetField == null) {
                // Not one of the extracted fields (e.g. the root entry itself).
                continue;
            }
            if (entry.getValue() == null) {
                logger.warn("Scraper definition for '{}' has no tag list; skipping", entry.getKey());
                continue;
            }
            dOMTagger.addDOMExtractDetails(new DOMTagger.DOMExtractDetails(
                    rootPrefix + buildSelector(entry.getValue()), targetField, true, "text"));
        }
    }

    /**
     * Registers the extraction rule for a single field of the scraper definition.
     *
     * @param map         scraper definition
     * @param fieldKey    key of the field inside the definition, e.g. {@link #ARTICLE}
     * @param targetField metadata field the extracted text is stored in
     * @param dOMTagger   tagger that receives the rule
     * @throws IllegalArgumentException if the definition has no entry for {@code fieldKey}
     */
    public void addDOMDetailsSingle(Map<String, List<Map<String, String>>> map, String fieldKey, String targetField,
                                    DOMTagger dOMTagger) {
        List<Map<String, String>> fieldDefinitions = map.get(fieldKey);
        if (fieldDefinitions == null) {
            throw new IllegalArgumentException("Scraper definition has no entry for field '" + fieldKey + "'");
        }
        dOMTagger.addDOMExtractDetails(new DOMTagger.DOMExtractDetails(
                rootPrefix(map) + buildSelector(fieldDefinitions), targetField, true, "text"));
    }

    /**
     * Registers an extraction rule from a ready-made selector.
     *
     * @param query       selector to use as-is
     * @param fieldKey    unused; kept so the signature mirrors {@link #addDOMDetailsSingle}
     * @param targetField metadata field the extracted text is stored in
     * @param dOMTagger   tagger that receives the rule
     */
    public void addDOMDetailsSingleFromQuery(String query, String fieldKey, String targetField, DOMTagger dOMTagger) {
        dOMTagger.addDOMExtractDetails(new DOMTagger.DOMExtractDetails(query, targetField, true, "text"));
    }

    /**
     * Runs the tagger over the text loaded by {@link #setXML()} and logs the resulting values at
     * debug level.
     *
     * <p>NOTE(review): the values are read under {@link #ARTICLE}, {@link #TITLE} and
     * {@link #DATE}, while {@link #get()} stores them under the {@code ScraperFields.SCRAPED_*}
     * names - confirm that these are the same keys.
     *
     * @param dOMTagger tagger to exercise
     * @throws IllegalStateException    if no document text has been loaded yet
     * @throws ImporterHandlerException if tagging fails
     * @throws IOException              if the document stream cannot be processed
     */
    public void testXMLParser(DOMTagger dOMTagger) throws ImporterHandlerException, IOException {
        if (this.xmlstr == null) {
            throw new IllegalStateException("No document loaded; call setXML() first");
        }
        ImporterMetadata metadata = new ImporterMetadata();
        performTagging(metadata, dOMTagger, this.xmlstr);
        logger.debug("article: {}", metadata.getString(ARTICLE));
        logger.debug("title: {}", metadata.getString(TITLE));
        logger.debug("date: {}", metadata.getString(DATE));
    }

    /** Feeds the given text to the tagger as a {@code text/html} document. */
    private void performTagging(ImporterMetadata metadata, DOMTagger tagger, String content)
            throws ImporterHandlerException, IOException {
        metadata.setString("document.contentType", "text/html");
        // Encode explicitly so the bytes do not depend on the platform default charset.
        try (ByteArrayInputStream input = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8))) {
            tagger.tagDocument("n/a", input, metadata, false);
        }
    }

    /**
     * Loads the default test document into {@link #xmlstr}.
     *
     * @throws IOException if the file cannot be read
     */
    public void setXML() throws IOException {
        setXML(Paths.get(DEFAULT_TEST_HTML));
    }

    /**
     * Loads the given file, decoded as UTF-8, into {@link #xmlstr}.
     *
     * @param htmlFile document to load
     * @throws IOException if the file cannot be read
     */
    public void setXML(Path htmlFile) throws IOException {
        this.xmlstr = new String(Files.readAllBytes(htmlFile), StandardCharsets.UTF_8);
    }

    /**
     * Manual test harness: tags a local HTML file with a local scraper definition and logs the
     * extracted values.
     *
     * @param strArr optional arguments: {@code [0]} scraper directory, {@code [1]} HTML file;
     *               developer defaults are used for any that are missing
     */
    // The ACLEDTransformer constructor signature is not visible from this file, so the map is
    // deliberately left raw rather than guessing its type parameters.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static void main(String[] strArr) throws ImporterHandlerException, IOException {
        String scraperDir = strArr.length > 0 ? strArr[0] : DEFAULT_SCRAPER_DIR;
        Path htmlFile = Paths.get(strArr.length > 1 ? strArr[1] : DEFAULT_TEST_HTML);

        HashMap replacements = new HashMap();
        replacements.put("<script.*?>.*?<\\/script>", "");
        ACLEDTransformer transformer = new ACLEDTransformer(replacements);

        ACLEDTagger acledTagger = new ACLEDTagger(scraperDir);
        DOMTagger tagger = acledTagger.get();
        acledTagger.setXML(htmlFile);
        acledTagger.xmlstr = transformer.transform(acledTagger.xmlstr);
        acledTagger.testXMLParser(tagger);

        ImporterMetadata metadata = new ImporterMetadata();
        acledTagger.performTagging(metadata, tagger, acledTagger.xmlstr);
        logger.info("article (all values): {}", String.join(" ", metadata.getStrings(ARTICLE)));
        logger.info("article: {}", metadata.getString(ARTICLE));
        logger.info("title: {}", metadata.getString(TITLE));
        logger.info("date: {}", metadata.getString(DATE));
    }

    /** Returns the root selector followed by a space, or an empty string when there is no root. */
    private static String rootPrefix(Map<String, List<Map<String, String>>> definition) {
        String root = constructRoot(definition);
        return root.isEmpty() ? root : root + " ";
    }

    /** Maps a definition key to the metadata field it is stored in, or null for other keys. */
    private static String targetFieldFor(String fieldKey) {
        if (ARTICLE.equals(fieldKey)) {
            return ScraperFields.SCRAPED_ARTICLE;
        }
        if (TITLE.equals(fieldKey)) {
            return ScraperFields.SCRAPED_TITLE;
        }
        if (DATE.equals(fieldKey)) {
            return ScraperFields.SCRAPED_DATE;
        }
        return null;
    }

    /**
     * Concatenates tag descriptors into one selector string.
     *
     * <p>Descriptors without a {@code "tag"} or {@code "custom"} key are ignored. While the
     * selector is still empty the values are appended directly; afterwards each value is preceded
     * by a space. A {@code "class"} value, if present, is appended as {@code .class}.
     */
    private static String buildSelector(List<Map<String, String>> tagDefinitions) {
        StringBuilder selector = new StringBuilder();
        for (Map<String, String> descriptor : tagDefinitions) {
            boolean hasTag = descriptor.containsKey("tag");
            boolean hasCustom = descriptor.containsKey("custom");
            if (!hasTag && !hasCustom) {
                continue;
            }
            // Decided once per descriptor, before anything from it has been appended.
            String separator = selector.length() == 0 ? "" : " ";
            if (hasTag) {
                selector.append(separator).append(descriptor.get("tag"));
            }
            if (hasCustom) {
                selector.append(separator).append(descriptor.get("custom"));
            }
            if (descriptor.containsKey("class")) {
                selector.append('.').append(descriptor.get("class"));
            }
        }
        return selector.toString();
    }
}
