package com.casm.acled.crawler.management;

import com.casm.acled.crawler.scraper.ACLEDMetadataPreProcessor;
import com.casm.acled.crawler.scraper.ACLEDScraper;
import com.norconex.collector.core.crawler.ICrawlerConfig;
import com.norconex.collector.core.data.store.impl.mvstore.MVStoreCrawlDataStoreFactory;
import com.norconex.collector.core.filter.IReferenceFilter;
import com.norconex.collector.core.filter.impl.ExtensionReferenceFilter;
import com.norconex.collector.http.HttpCollectorConfig;
import com.norconex.collector.http.crawler.HttpCrawlerConfig;
import com.norconex.collector.http.crawler.URLCrawlScopeStrategy;
import com.norconex.collector.http.delay.impl.GenericDelayResolver;
import com.norconex.collector.http.processor.IHttpDocumentProcessor;
import com.norconex.collector.http.sitemap.impl.StandardSitemapResolverFactory;
import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.impl.GenericLinkExtractor;
import com.norconex.collector.http.url.impl.XMLFeedLinkExtractor;
import com.norconex.importer.ImporterConfig;
import com.norconex.importer.handler.IImporterHandler;
import com.norconex.importer.handler.filter.AbstractDocumentFilter;
import com.norconex.importer.handler.filter.OnMatch;
import com.norconex.importer.parser.GenericDocumentParserFactory;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.FileAttribute;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.List;

/* loaded from: input_file:com/casm/acled/crawler/management/NorconexConfiguration.class */
/**
 * Assembles the Norconex collector, crawler and importer configuration objects
 * used for a single crawl rooted at a working directory.
 *
 * <p>The three configuration objects are created and wired together in the
 * constructor. Callers may then register document filters via
 * {@link #addFilter(AbstractDocumentFilter)}, install them with
 * {@link #finalise()}, and attach the scraper via
 * {@link #setScraper(ACLEDScraper, ACLEDMetadataPreProcessor)}.
 */
public class NorconexConfiguration {
    /** Name of the sub-directory (under the working directory) given to the collector as progress dir. */
    private static final String PROGRESS = "progress";
    /** Name of the sub-directory (under the working directory) given to the collector as logs dir. */
    private static final String LOGS = "logs";
    /** Comma-separated file extensions excluded from crawling by the reference filter. */
    private static final String EXCLUDED_EXTENSIONS =
            "jpeg,jpg,png,pdf,ico,mpg,mp4,avi,mp3,mov,dvi,gif,tiff,bmp,wav";
    /** Milliseconds in one day; subtracted from the {@code from} date handed to the sitemap resolver. */
    private static final long ONE_DAY_MILLIS = 24L * 60L * 60L * 1000L;
    /** Lower bound applied to the default crawl delay, in the same unit as {@link #politeness}. */
    private static final long MIN_DELAY = 50L;

    private final Path workingDir;
    private final boolean ignoreSiteMap;
    // NOTE(review): the two fields below are never read or written in this class;
    // kept only to avoid changing the class layout.
    private String urlRegex;
    private List<String> regexFilterPatterns;
    private final CrawlArgs args;
    private final String userAgent = "CASM Consulting LLP";
    private final int numThreads = 3;
    private final boolean ignoreRobots = false;
    private final int depth = 5;
    private final long politeness = 100;
    private final List<AbstractDocumentFilter> filters = new ArrayList<>();
    private final ImporterConfig importer = new ImporterConfig();
    private final HttpCollectorConfig collector = new HttpCollectorConfig();
    private final HttpCrawlerConfig crawler = new HttpCrawlerConfig();

    /**
     * Builds the importer, crawler and collector configuration for the given working directory.
     *
     * @param path      working directory used for the crawler work dir, importer temp dir,
     *                  and the collector progress/logs sub-directories
     * @param crawlArgs crawl arguments; {@code ignoreSiteMap} and {@code from} are read here
     * @throws RuntimeException if the logs directory cannot be created
     */
    public NorconexConfiguration(Path path, CrawlArgs crawlArgs) {
        this.args = crawlArgs;
        // A missing (null) flag is treated as "do not ignore the sitemap" instead of failing with an NPE.
        this.ignoreSiteMap = Boolean.TRUE.equals(crawlArgs.ignoreSiteMap);
        this.workingDir = path;
        configureImporter();
        configureCrawler();
        configureCollector();
    }

    /** @return the collector configuration held by this instance */
    public HttpCollectorConfig collector() {
        return this.collector;
    }

    /** @return the crawler configuration held by this instance */
    public HttpCrawlerConfig crawler() {
        return this.crawler;
    }

    /** @return the importer configuration held by this instance */
    public ImporterConfig importer() {
        return this.importer;
    }

    /**
     * Assigns the same identifier to both the collector and the crawler.
     *
     * @param str identifier to set
     */
    public void setId(String str) {
        this.collector.setId(str);
        this.crawler.setId(str);
    }

    /** @return the working directory supplied at construction time */
    public Path getWorkingDir() {
        return this.workingDir;
    }

    /**
     * Registers the crawler with the collector, points the collector at the
     * progress and logs sub-directories, and creates the logs directory.
     */
    private void configureCollector() {
        this.collector.setCrawlerConfigs(new ICrawlerConfig[] {this.crawler});
        this.collector.setProgressDir(this.workingDir.resolve(PROGRESS).toString());
        this.collector.setLogsDir(this.workingDir.resolve(LOGS).toString());
        try {
            // Only the logs directory is created eagerly here.
            Files.createDirectories(Paths.get(this.collector.getLogsDir()));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Applies all crawler settings: robots/sitemap handling, threading, crawl scope,
     * data store, extension filter, sitemap resolver, delay resolver and link extractors.
     */
    private void configureCrawler() {
        this.crawler.setLinkExtractorQuitAtDepth(true);
        this.crawler.setImporterConfig(this.importer);
        this.crawler.setUserAgent(this.userAgent);
        this.crawler.setIgnoreRobotsMeta(this.ignoreRobots);
        this.crawler.setIgnoreRobotsTxt(this.ignoreRobots);
        this.crawler.setIgnoreCanonicalLinks(false);
        this.crawler.setIgnoreSitemap(this.ignoreSiteMap);
        this.crawler.setNumThreads(this.numThreads);
        this.crawler.setKeepDownloads(false);
        // The original code first set IGNORE and then overwrote it with PROCESS;
        // only the effective value is kept.
        this.crawler.setOrphansStrategy(ICrawlerConfig.OrphansStrategy.PROCESS);

        URLCrawlScopeStrategy scope = new URLCrawlScopeStrategy();
        scope.setStayOnDomain(true);
        scope.setIncludeSubdomains(true);
        scope.setStayOnPort(false);
        scope.setStayOnProtocol(false);
        this.crawler.setUrlCrawlScopeStrategy(scope);
        this.crawler.setKeepOutOfScopeLinks(false);

        this.crawler.setWorkDir(this.workingDir.toFile());
        this.crawler.setCrawlDataStoreFactory(new MVStoreCrawlDataStoreFactory());

        // Declared with the concrete type: setOnMatch is called on the filter itself.
        ExtensionReferenceFilter extensionFilter = new ExtensionReferenceFilter(EXCLUDED_EXTENSIONS);
        extensionFilter.setOnMatch(OnMatch.EXCLUDE);
        this.crawler.setReferenceFilters(new IReferenceFilter[] {extensionFilter});

        StandardSitemapResolverFactory sitemapFactory = new StandardSitemapResolverFactory();
        if (this.args.from != null) {
            // Start of the "from" day in the system zone, moved back by one day.
            long fromMillis =
                    this.args.from.atStartOfDay(ZoneId.systemDefault()).toInstant().toEpochMilli();
            sitemapFactory.setFrom(fromMillis - ONE_DAY_MILLIS);
        }
        sitemapFactory.setLenient(true);
        sitemapFactory.setEscalateErrors(true);
        this.crawler.setSitemapResolverFactory(sitemapFactory);

        GenericDelayResolver delayResolver = new GenericDelayResolver();
        // Never configure a default delay below MIN_DELAY.
        delayResolver.setDefaultDelay(Math.max(MIN_DELAY, this.politeness));
        delayResolver.setIgnoreRobotsCrawlDelay(this.ignoreRobots);
        delayResolver.setScope("site");
        this.crawler.setDelayResolver(delayResolver);

        // Declared with the concrete type: the setters below are called on the extractor itself.
        GenericLinkExtractor linkExtractor = new GenericLinkExtractor();
        linkExtractor.setIgnoreNofollow(this.ignoreRobots);
        linkExtractor.setCharset(StandardCharsets.UTF_8.toString());
        this.crawler.setLinkExtractors(
                new ILinkExtractor[] {linkExtractor, new XMLFeedLinkExtractor()});
    }

    /**
     * Queues a document filter; it takes effect only once {@link #finalise()} is called.
     *
     * @param abstractDocumentFilter filter to queue
     * @return this configuration, for chaining
     */
    public NorconexConfiguration addFilter(AbstractDocumentFilter abstractDocumentFilter) {
        this.filters.add(abstractDocumentFilter);
        return this;
    }

    /**
     * Installs every filter queued so far as the importer's post-parse handlers,
     * replacing whatever handlers were set before.
     *
     * @return this configuration, for chaining
     */
    public NorconexConfiguration finalise() {
        this.importer.setPostParseHandlers(this.filters.toArray(new IImporterHandler[0]));
        return this;
    }

    /**
     * Sets the crawler's pre-import processors to the scraper followed by the
     * metadata pre-processor, in that order.
     *
     * @param aCLEDScraper              scraper, placed first
     * @param aCLEDMetadataPreProcessor metadata pre-processor, placed second
     */
    public void setScraper(ACLEDScraper aCLEDScraper, ACLEDMetadataPreProcessor aCLEDMetadataPreProcessor) {
        this.crawler.setPreImportProcessors(
                new IHttpDocumentProcessor[] {aCLEDScraper, aCLEDMetadataPreProcessor});
    }

    /**
     * @return the configured politeness value; note the delay resolver uses this value
     *         clamped to a minimum of 50, whereas this getter returns it unclamped
     */
    public long getPoliteness() {
        return this.politeness;
    }

    /**
     * Configures the importer with a parser factory whose ignored-content-types
     * regex is {@code ".*"} and uses the working directory as temp dir.
     */
    private void configureImporter() {
        GenericDocumentParserFactory parserFactory = new GenericDocumentParserFactory();
        parserFactory.setIgnoredContentTypesRegex(".*");
        this.importer.setParserFactory(parserFactory);
        this.importer.setTempDir(this.workingDir.toFile());
    }
}
