package com.casm.acled.crawler;

import com.casm.acled.crawler.management.CrawlArgs;
import com.casm.acled.crawler.management.NorconexConfiguration;
import com.casm.acled.crawler.reporting.Reporter;
import com.casm.acled.crawler.scraper.ACLEDCommitter;
import com.casm.acled.crawler.scraper.ACLEDMetadataPreProcessor;
import com.casm.acled.crawler.scraper.ACLEDScraper;
import com.casm.acled.crawler.scraper.ACLEDTagger;
import com.casm.acled.crawler.scraper.ACLEDTransformer;
import com.casm.acled.crawler.scraper.AcceptFilter;
import com.casm.acled.crawler.scraper.ScraperFields;
import com.casm.acled.crawler.scraper.dates.CompositeDateParser;
import com.casm.acled.crawler.scraper.dates.DateTagger;
import com.casm.acled.crawler.scraper.dates.ExcludingCustomDateMetadataFilter;
import com.casm.acled.crawler.scraper.dates.SiteMapLastModifiedMetadataFilter;
import com.casm.acled.crawler.scraper.keywords.ExcludingKeywordFilter;
import com.casm.acled.crawler.scraper.keywords.KeywordTagger;
import com.casm.acled.entities.source.Source;
import com.casm.acled.entities.sourcelist.SourceList;
import com.norconex.collector.core.filter.IMetadataFilter;
import com.norconex.collector.core.filter.IReferenceFilter;
import com.norconex.collector.core.filter.impl.RegexReferenceFilter;
import com.norconex.collector.http.HttpCollector;
import com.norconex.collector.http.delay.impl.GenericDelayResolver;
import com.norconex.collector.http.robot.impl.StandardRobotsTxtProvider;
import com.norconex.collector.http.url.IURLNormalizer;
import com.norconex.collector.http.url.impl.GenericURLNormalizer;
import com.norconex.importer.handler.IImporterHandler;
import com.norconex.importer.handler.filter.OnMatch;
import com.norconex.importer.handler.filter.impl.DateMetadataFilter;
import com.norconex.importer.handler.filter.impl.EmptyMetadataFilter;
import com.norconex.importer.handler.tagger.impl.DOMTagger;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Date;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.function.Supplier;
import java.util.regex.Pattern;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;

/* loaded from: input_file:com/casm/acled/crawler/Crawl.class */
public class Crawl {
    private final Source source;
    public static final String SOURCE_ID = "SOURCE_ID";
    public static final String SOURCE_LIST_ID = "SOURCE_LIST_ID";
    public static final String SKIP_KEYWORD_FILTER = "SKIP_KEYWORD_FILTER";
    public static final String FROM = "FROM";
    public static final String TO = "TO";
    public static final String ARTICLE_LIMIT = "ARTICLE_LIMIT";
    public static final String DEPTH_LIMIT = "DEPTH_LIMIT";
    private final LocalDate from;
    private final LocalDate to;
    private final NorconexConfiguration config;
    private HttpCollector collector;
    private Supplier<HttpCollector> collectorSupplier = () -> {
        return this.collector;
    };
    private final Reporter reporter;

    /* loaded from: input_file:com/casm/acled/crawler/Crawl$RootLogAppenderClearingURLNormaliser.class */
    private class RootLogAppenderClearingURLNormaliser implements IURLNormalizer {
        private final GenericURLNormalizer genericURLNormalizer = new GenericURLNormalizer();

        public RootLogAppenderClearingURLNormaliser() {
        }

        public String normalizeURL(String str) {
            return this.genericURLNormalizer.normalizeURL(str);
        }
    }

    public Crawl(CrawlArgs crawlArgs, ACLEDCommitter aCLEDCommitter, Reporter reporter, List<String> list) {
        this.source = crawlArgs.source;
        this.from = crawlArgs.from;
        this.to = crawlArgs.to;
        this.reporter = reporter;
        String id = id(false);
        Path path = Paths.get(id, new String[0]);
        aCLEDCommitter.setCollectorSupplier(this.collectorSupplier);
        aCLEDCommitter.setMaxArticles(crawlArgs.maxArticle);
        Path path2 = crawlArgs.workingDir;
        crawlArgs.ignoreSiteMap = true;
        this.config = new NorconexConfiguration(path2.resolve(path), crawlArgs);
        this.config.crawler().setUrlNormalizer(new RootLogAppenderClearingURLNormaliser());
        if (new StandardRobotsTxtProvider().getRobotsTxt(HttpClientBuilder.create().build(), (String) this.source.get("LINK"), "CASM Tech").getCrawlDelay() > 100.0f) {
            GenericDelayResolver genericDelayResolver = new GenericDelayResolver();
            genericDelayResolver.setDefaultDelay(this.config.getPoliteness() <= 50 ? 50L : this.config.getPoliteness());
            genericDelayResolver.setIgnoreRobotsCrawlDelay(true);
            genericDelayResolver.setScope("site");
            this.config.crawler().setDelayResolver(genericDelayResolver);
        }
        ArrayList arrayList = new ArrayList();
        if (this.source.isFalse("CRAWL_DISABLE_SITEMAPS")) {
            if (this.source.isFalse("CRAWL_DISABLE_SITEMAP_DISCOVERY")) {
                arrayList.addAll(list);
            }
            if (this.source.hasValue("CRAWL_SITEMAP_LOCATIONS")) {
                arrayList.addAll((Collection) this.source.get("CRAWL_SITEMAP_LOCATIONS"));
            }
            this.config.crawler().setStartSitemapURLs((String[]) arrayList.toArray(new String[0]));
        }
        ArrayList arrayList2 = new ArrayList();
        ArrayList arrayList3 = new ArrayList();
        arrayList2.add(new AcceptFilter());
        arrayList2.add(new EmptyMetadataFilter(OnMatch.EXCLUDE, new String[]{ScraperFields.SCRAPED_ARTICLE, ScraperFields.SCRAPED_DATE}));
        if (this.from != null && this.to != null) {
            String str = (String) this.source.get("TIMEZONE");
            ZoneId systemDefault = str == null ? ZoneId.systemDefault() : ZoneId.of(str);
            arrayList3.add(dateTagger(this.source, ZonedDateTime.of(this.from.atTime(0, 0, 0), systemDefault), ZonedDateTime.of(this.to.atTime(0, 0, 0), systemDefault)));
        }
        if (!crawlArgs.skipKeywords.booleanValue()) {
            arrayList3.add(keywordTagger(crawlArgs.sourceLists.get(0), this.source));
        }
        NorconexConfiguration norconexConfiguration = this.config;
        norconexConfiguration.getClass();
        arrayList2.forEach(norconexConfiguration::addFilter);
        List list2 = (List) this.source.get("SEED_URLS");
        String[] split = (list2 == null || list2.isEmpty()) ? ((String) this.source.get("LINK")).split(",") : (String[]) list2.toArray(new String[0]);
        if ("norconex".equals("scraper")) {
            this.config.setScraper(ACLEDScraper.load(crawlArgs.scrapersDir, this.source, reporter), new ACLEDMetadataPreProcessor(split[0]));
            arrayList3.addAll(arrayList2);
            this.config.importer().setPostParseHandlers((IImporterHandler[]) arrayList3.toArray(new IImporterHandler[arrayList3.size()]));
        }
        if ("norconex".equals("norconex")) {
            DOMTagger dOMTagger = new ACLEDTagger(crawlArgs.scrapersDir, this.source).get();
            HashMap hashMap = new HashMap();
            hashMap.put("<script.*?>.*?<\\/script>", "");
            new ACLEDTransformer(hashMap);
            arrayList3.add(0, ACLEDTransformer.transformer);
            arrayList3.add(1, dOMTagger);
            arrayList3.addAll(arrayList2);
            this.config.importer().setPostParseHandlers((IImporterHandler[]) arrayList3.toArray(new IImporterHandler[arrayList3.size()]));
        }
        if (this.source.hasValue("CRAWL_EXCLUDE_PATTERN")) {
            this.config.crawler().setReferenceFilters(new IReferenceFilter[]{new RegexReferenceFilter((String) this.source.get("CRAWL_EXCLUDE_PATTERN"), OnMatch.EXCLUDE)});
        }
        if (!crawlArgs.ignoreSiteMap.booleanValue() && this.from != null) {
            this.config.crawler().setMetadataFilters(new IMetadataFilter[]{new SiteMapLastModifiedMetadataFilter(this.from)});
        }
        applySourceIdiosyncrasies(this.source, this.config);
        this.config.crawler().setRecrawlableResolver(new DontRecrawlResolver(split, this.source.hasValue("CRAWL_RECRAWL_PATTERN") ? Pattern.compile((String) this.source.get("CRAWL_RECRAWL_PATTERN")) : null));
        this.config.crawler().setMaxDepth(crawlArgs.depth.intValue());
        this.config.crawler().setStartURLs(split);
        if (crawlArgs.crawlId == null || crawlArgs.crawlId.isEmpty()) {
            this.config.setId(id);
        } else {
            this.config.setId(crawlArgs.crawlId);
        }
        this.config.crawler().setCommitter(aCLEDCommitter);
    }

    public NorconexConfiguration getConfig() {
        return this.config;
    }

    private ExcludingKeywordFilter keywordFilter(SourceList sourceList, Source source) {
        return new ExcludingKeywordFilter(ScraperFields.SCRAPED_ARTICLE, resolveQuery(sourceList, source));
    }

    private KeywordTagger keywordTagger(SourceList sourceList, Source source) {
        return new KeywordTagger(ScraperFields.SCRAPED_ARTICLE, resolveQuery(sourceList, source));
    }

    private DateMetadataFilter dateFilter(Source source, ZonedDateTime zonedDateTime, ZonedDateTime zonedDateTime2) {
        ExcludingCustomDateMetadataFilter excludingCustomDateMetadataFilter = new ExcludingCustomDateMetadataFilter(source, ScraperFields.SCRAPED_DATE, CompositeDateParser.of((List) source.get("DATE_FORMAT")), this.reporter);
        excludingCustomDateMetadataFilter.addCondition(DateMetadataFilter.Operator.GREATER_EQUAL, Date.from(zonedDateTime.toInstant()));
        excludingCustomDateMetadataFilter.addCondition(DateMetadataFilter.Operator.LOWER_EQUAL, Date.from(zonedDateTime2.toInstant()));
        return excludingCustomDateMetadataFilter;
    }

    private DateTagger dateTagger(Source source, ZonedDateTime zonedDateTime, ZonedDateTime zonedDateTime2) {
        DateTagger dateTagger = new DateTagger(source, ScraperFields.SCRAPED_DATE, CompositeDateParser.of((List) source.get("DATE_FORMAT")), this.reporter);
        dateTagger.setFromTime(zonedDateTime);
        dateTagger.setToTime(zonedDateTime2);
        return dateTagger;
    }

    private void applySourceIdiosyncrasies(Source source, NorconexConfiguration norconexConfiguration) {
    }

    public String id(boolean z) {
        return id(this.source, this.from, this.to, z);
    }

    public static String id(Source source) {
        return id(source, null, null, false);
    }

    public static String id(Source source, LocalDate localDate, LocalDate localDate2, boolean z) {
        StringBuilder sb = new StringBuilder();
        sb.append(((String) source.get("STANDARD_NAME")).toLowerCase().replaceAll(" ", "-"));
        if (z) {
            sb.append("-").append(localDate == null ? "" : localDate.toString()).append("-").append(localDate2 == null ? "" : localDate2.toString());
        }
        return sb.toString();
    }

    private String resolveQuery(SourceList sourceList, Source source) {
        return (String) sourceList.get("KEYWORDS");
    }

    public void run() {
        this.collector = new HttpCollector(this.config.collector());
        this.collector.start(true);
    }
}
