package com.norconex.collector.http.crawler;

import com.google.common.base.Objects;
import com.norconex.collector.core.CollectorException;
import com.norconex.collector.core.crawler.AbstractCrawler;
import com.norconex.collector.core.crawler.ICrawler;
import com.norconex.collector.core.data.BaseCrawlData;
import com.norconex.collector.core.data.CrawlState;
import com.norconex.collector.core.data.ICrawlData;
import com.norconex.collector.core.data.store.ICrawlDataStore;
import com.norconex.collector.core.pipeline.importer.ImporterPipelineContext;
import com.norconex.collector.http.data.HttpCrawlData;
import com.norconex.collector.http.doc.HttpDocument;
import com.norconex.collector.http.doc.HttpMetadata;
import com.norconex.collector.http.pipeline.committer.HttpCommitterPipeline;
import com.norconex.collector.http.pipeline.committer.HttpCommitterPipelineContext;
import com.norconex.collector.http.pipeline.importer.HttpImporterPipeline;
import com.norconex.collector.http.pipeline.importer.HttpImporterPipelineContext;
import com.norconex.collector.http.pipeline.queue.HttpQueuePipeline;
import com.norconex.collector.http.pipeline.queue.HttpQueuePipelineContext;
import com.norconex.collector.http.redirect.RedirectStrategyWrapper;
import com.norconex.collector.http.sitemap.ISitemapResolver;
import com.norconex.collector.http.sitemap.SitemapURLAdder;
import com.norconex.commons.lang.url.HttpURL;
import com.norconex.importer.doc.ImporterDocument;
import com.norconex.importer.response.ImporterResponse;
import com.norconex.jef4.status.IJobStatus;
import com.norconex.jef4.status.JobStatusUpdater;
import com.norconex.jef4.suite.JobSuite;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.NumberFormat;
import java.util.Iterator;
import org.apache.commons.collections4.multimap.ArrayListValuedHashMap;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/* loaded from: input_file:com/norconex/collector/http/crawler/HttpCrawler.class */
/**
 * {@link AbstractCrawler} specialization that crawls over HTTP.
 * <p>
 * It owns the {@link HttpClient} used for requests and, when the configuration
 * supplies a sitemap resolver factory, the {@link ISitemapResolver} used to
 * expand sitemaps. Both are created in {@code prepareExecution} and released
 * in {@code cleanupExecution}.
 */
public class HttpCrawler extends AbstractCrawler {
    private static final Logger LOG = LogManager.getLogger(HttpCrawler.class);
    // Created by initializeHTTPClient(); null until prepareExecution has run.
    private HttpClient httpClient;
    // Created in prepareExecution() only when a resolver factory is configured; may stay null.
    private ISitemapResolver sitemapResolver;

    /**
     * Creates a crawler driven by the given HTTP crawler configuration.
     *
     * @param httpCrawlerConfig configuration handed to the parent constructor
     */
    public HttpCrawler(HttpCrawlerConfig httpCrawlerConfig) {
        super(httpCrawlerConfig);
    }

    /* renamed from: getCrawlerConfig, reason: merged with bridge method [inline-methods] */
    /**
     * Returns this crawler's configuration typed as {@link HttpCrawlerConfig}.
     *
     * @return the configuration exposed by the parent class, downcast to the
     *         HTTP-specific type
     */
    public HttpCrawlerConfig m4getCrawlerConfig() {
        // The only constructor accepts an HttpCrawlerConfig, so the parent is
        // presumably holding exactly that instance and the downcast is safe.
        return (HttpCrawlerConfig) super.getCrawlerConfig();
    }

    /**
     * Gives access to the HTTP client used by this crawler.
     *
     * @return the current HTTP client, or {@code null} if none has been
     *         initialized yet
     */
    public HttpClient getHttpClient() {
        return httpClient;
    }

    /**
     * Gives access to the sitemap resolver used by this crawler.
     *
     * @return the current sitemap resolver, or {@code null} if none has been
     *         created
     */
    public ISitemapResolver getSitemapResolver() {
        return sitemapResolver;
    }

    /**
     * Stops the crawler through the parent implementation, then stops the
     * sitemap resolver when one exists.
     *
     * @param iJobStatus job status forwarded to the parent
     * @param jobSuite   job suite forwarded to the parent
     */
    public void stop(IJobStatus iJobStatus, JobSuite jobSuite) {
        super.stop(iJobStatus, jobSuite);
        if (sitemapResolver == null) {
            return;
        }
        sitemapResolver.stop();
    }

    /**
     * Readies the crawler before crawling starts: logs the active settings,
     * builds the HTTP client, wraps its redirect strategy, creates the sitemap
     * resolver when a factory is configured and, unless {@code z} is set,
     * queues the start URLs.
     *
     * @param jobStatusUpdater not used by this implementation
     * @param jobSuite         not used by this implementation
     * @param iCrawlDataStore  store that receives the queued start URLs
     * @param z                when {@code true}, start URLs are not queued;
     *                         also passed to the sitemap resolver factory
     *                         (presumably a "resume" flag — confirm against
     *                         the parent class)
     */
    protected void prepareExecution(JobStatusUpdater jobStatusUpdater, JobSuite jobSuite, ICrawlDataStore iCrawlDataStore, boolean z) {
        logInitializationInformation();
        initializeHTTPClient();
        initializeRedirectionStrategy();

        final HttpCrawlerConfig config = m4getCrawlerConfig();
        if (config.getSitemapResolverFactory() != null) {
            this.sitemapResolver = config.getSitemapResolverFactory().createSitemapResolver(config, z);
        }

        if (!z) {
            queueStartURLs(iCrawlDataStore);
        }
    }

    /**
     * Queues start URLs from every supported source (sitemaps, configured
     * URLs, seed files, providers — in that order) and logs the overall count.
     *
     * @param iCrawlDataStore store that receives the queued URLs
     */
    private void queueStartURLs(ICrawlDataStore iCrawlDataStore) {
        NumberFormat formatter = NumberFormat.getNumberInstance();
        int total = queueStartURLsSitemaps(iCrawlDataStore);
        total += queueStartURLsRegular(iCrawlDataStore);
        total += queueStartURLsSeedFiles(iCrawlDataStore);
        total += queueStartURLsProviders(iCrawlDataStore);
        LOG.info(formatter.format(total) + " start URLs identified.");
    }

    /**
     * Queues the start URLs listed directly in the crawler configuration.
     * Blank entries are ignored (with a debug message) and are not counted.
     *
     * @param iCrawlDataStore store that receives the queued URLs
     * @return the number of non-blank start URLs handed to the queue pipeline,
     *         or 0 when no start URLs are configured
     */
    private int queueStartURLsRegular(ICrawlDataStore iCrawlDataStore) {
        String[] startURLs = m4getCrawlerConfig().getStartURLs();
        if (startURLs == null) {
            return 0;
        }
        int queued = 0;
        for (String startURL : startURLs) {
            if (StringUtils.isBlank(startURL)) {
                LOG.debug("Blank start URL encountered, ignoring it.");
                continue;
            }
            executeQueuePipeline(new HttpCrawlData(startURL, 0), iCrawlDataStore);
            // Count only what was actually submitted so the logged total is accurate.
            queued++;
        }
        return queued;
    }

    /**
     * Queues start URLs read from the seed files listed in the crawler
     * configuration. Each file is read as UTF-8, one URL per line; blank lines
     * and lines starting with {@code #} are skipped.
     *
     * @param iCrawlDataStore store that receives the queued URLs
     * @return the number of URLs handed to the queue pipeline, or 0 when no
     *         seed files are configured
     * @throws CollectorException if a seed file cannot be opened or read
     */
    private int queueStartURLsSeedFiles(ICrawlDataStore iCrawlDataStore) {
        String[] startURLsFiles = m4getCrawlerConfig().getStartURLsFiles();
        if (startURLsFiles == null) {
            return 0;
        }
        int queued = 0;
        for (String urlsFile : startURLsFiles) {
            LineIterator lines = null;
            // try-with-resources guarantees the stream is closed even when
            // reading or queuing fails part-way through the file.
            try (FileInputStream in = new FileInputStream(urlsFile)) {
                lines = IOUtils.lineIterator(in, StandardCharsets.UTF_8);
                while (lines.hasNext()) {
                    String url = StringUtils.trimToNull(lines.nextLine());
                    if (url != null && !url.startsWith("#")) {
                        executeQueuePipeline(new HttpCrawlData(url, 0), iCrawlDataStore);
                        queued++;
                    }
                }
            } catch (IOException e) {
                throw new CollectorException("Could not process URLs file: " + urlsFile, e);
            } finally {
                LineIterator.closeQuietly(lines);
            }
        }
        return queued;
    }

    /**
     * Queues the URLs found in the sitemaps configured as start URLs.
     * Sitemap locations are grouped by URL root and each group is passed to
     * the sitemap resolver, which reports every discovered URL back through a
     * callback that queues it.
     *
     * @param iCrawlDataStore store that receives the queued URLs
     * @return the number of URLs reported by the resolver and queued, or 0
     *         when no start sitemaps are configured
     */
    private int queueStartURLsSitemaps(final ICrawlDataStore iCrawlDataStore) {
        String[] startSitemapURLs = m4getCrawlerConfig().getStartSitemapURLs();
        if (startSitemapURLs == null) {
            return 0;
        }

        // Group sitemap locations by URL root; the resolver is invoked once per root.
        ArrayListValuedHashMap<String, String> sitemapsByRoot = new ArrayListValuedHashMap<>();
        for (String sitemapURL : startSitemapURLs) {
            sitemapsByRoot.put(HttpURL.getRoot(sitemapURL), sitemapURL);
        }

        final MutableInt urlCount = new MutableInt();
        SitemapURLAdder urlAdder = new SitemapURLAdder() { // from class: com.norconex.collector.http.crawler.HttpCrawler.1
            @Override // com.norconex.collector.http.sitemap.SitemapURLAdder
            public void add(HttpCrawlData httpCrawlData) {
                HttpCrawler.this.executeQueuePipeline(httpCrawlData, iCrawlDataStore);
                urlCount.increment();
            }
        };

        for (String urlRoot : sitemapsByRoot.keySet()) {
            String[] locations = sitemapsByRoot.get(urlRoot).toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            if (this.sitemapResolver != null) {
                this.sitemapResolver.resolveSitemaps(this.httpClient, urlRoot, locations, urlAdder, true);
            } else {
                // The resolver only exists when a resolver factory is configured.
                LOG.error("Sitemap resolver is null. Sitemaps defined as start URLs cannot be resolved.");
            }
        }
        return urlCount.intValue();
    }

    /**
     * Queues every start URL supplied by the configured start URL providers.
     * {@code null} provider entries are skipped.
     *
     * @param iCrawlDataStore store that receives the queued URLs
     * @return the number of URLs handed to the queue pipeline, or 0 when no
     *         providers are configured
     */
    private int queueStartURLsProviders(ICrawlDataStore iCrawlDataStore) {
        IStartURLsProvider[] providers = m4getCrawlerConfig().getStartURLsProviders();
        if (providers == null) {
            return 0;
        }
        int queued = 0;
        for (IStartURLsProvider provider : providers) {
            if (provider == null) {
                continue;
            }
            for (Iterator<String> urls = provider.provideStartURLs(); urls.hasNext();) {
                executeQueuePipeline(new HttpCrawlData(urls.next(), 0), iCrawlDataStore);
                queued++;
            }
        }
        return queued;
    }

    /**
     * Logs, at INFO level, which optional crawler features are enabled
     * (robots.txt, robots meta, sitemap, canonical links) and the configured
     * user agent. When no user agent is set, a debug hint recommends
     * specifying one.
     */
    private void logInitializationInformation() {
        final HttpCrawlerConfig config = m4getCrawlerConfig();

        // Each "support" flag is the negation of the matching "ignore" setting.
        LOG.info(getId() + ": RobotsTxt support: " + (!config.isIgnoreRobotsTxt()));
        LOG.info(getId() + ": RobotsMeta support: " + (!config.isIgnoreRobotsMeta()));
        LOG.info(getId() + ": Sitemap support: " + (!config.isIgnoreSitemap()));
        LOG.info(getId() + ": Canonical links support: " + (!config.isIgnoreCanonicalLinks()));

        final String agent = config.getUserAgent();
        if (StringUtils.isBlank(agent)) {
            LOG.info(getId() + ": User-Agent: <None specified>");
            LOG.debug("It is recommended you identify yourself to web sites by specifying a user agent (https://en.wikipedia.org/wiki/User_agent)");
            return;
        }
        LOG.info(getId() + ": User-Agent: " + agent);
    }

    /**
     * Runs the HTTP queue pipeline for one piece of crawl data.
     *
     * @param iCrawlData      crawl data to queue; must be an {@link HttpCrawlData}
     * @param iCrawlDataStore store made available to the pipeline
     */
    protected void executeQueuePipeline(ICrawlData iCrawlData, ICrawlDataStore iCrawlDataStore) {
        HttpQueuePipeline pipeline = new HttpQueuePipeline();
        HttpQueuePipelineContext context =
                new HttpQueuePipelineContext(this, iCrawlDataStore, (HttpCrawlData) iCrawlData);
        pipeline.execute(context);
    }

    /**
     * Wraps an importer document into an {@link HttpDocument}.
     *
     * @param iCrawlData       not used by this implementation
     * @param importerDocument document to wrap
     * @return a new {@link HttpDocument} built from {@code importerDocument}
     */
    protected ImporterDocument wrapDocument(ICrawlData iCrawlData, ImporterDocument importerDocument) {
        return new HttpDocument(importerDocument);
    }

    /**
     * Copies crawl-related information (depth, sitemap attributes, referrer
     * details, redirect trail) from the crawl data into the document metadata.
     * <p>
     * When the second crawl data shares the same non-null referrer reference,
     * any referrer link tag, text, or title missing from the first one is
     * taken from the second before the metadata is written.
     *
     * @param iCrawlData       crawl data of the document being processed; must
     *                         be an {@link HttpCrawlData}
     * @param iCrawlData2      optional second crawl data used to fill in
     *                         missing referrer details (presumably the cached
     *                         data from a previous crawl — confirm against the
     *                         parent class); may be {@code null}
     * @param importerDocument document whose metadata is populated; must be an
     *                         {@link HttpDocument}
     */
    protected void initCrawlData(ICrawlData iCrawlData, ICrawlData iCrawlData2, ImporterDocument importerDocument) {
        final HttpCrawlData data = (HttpCrawlData) iCrawlData;
        final HttpCrawlData fallback = (HttpCrawlData) iCrawlData2;
        final HttpMetadata meta = ((HttpDocument) importerDocument).m12getMetadata();

        // Depth and sitemap attributes.
        meta.addInt(HttpMetadata.COLLECTOR_DEPTH, new int[]{data.getDepth()});
        metadataAddString(meta, HttpMetadata.COLLECTOR_SM_CHANGE_FREQ, data.getSitemapChangeFreq());
        if (data.getSitemapLastMod() != null) {
            meta.addLong(HttpMetadata.COLLECTOR_SM_LASTMOD, new long[]{data.getSitemapLastMod().longValue()});
        }
        if (data.getSitemapPriority() != null) {
            meta.addFloat(HttpMetadata.COLLECTOR_SM_PRORITY, new float[]{data.getSitemapPriority().floatValue()});
        }

        // Borrow missing referrer details only when both records point at the same referrer.
        final boolean sameReferrer = fallback != null
                && data.getReferrerReference() != null
                && Objects.equal(data.getReferrerReference(), fallback.getReferrerReference());
        if (sameReferrer) {
            if (data.getReferrerLinkTag() == null) {
                data.setReferrerLinkTag(fallback.getReferrerLinkTag());
            }
            if (data.getReferrerLinkText() == null) {
                data.setReferrerLinkText(fallback.getReferrerLinkText());
            }
            if (data.getReferrerLinkTitle() == null) {
                data.setReferrerLinkTitle(fallback.getReferrerLinkTitle());
            }
        }

        // Referrer details.
        metadataAddString(meta, HttpMetadata.COLLECTOR_REFERRER_REFERENCE, data.getReferrerReference());
        metadataAddString(meta, HttpMetadata.COLLECTOR_REFERRER_LINK_TAG, data.getReferrerLinkTag());
        metadataAddString(meta, HttpMetadata.COLLECTOR_REFERRER_LINK_TEXT, data.getReferrerLinkText());
        metadataAddString(meta, HttpMetadata.COLLECTOR_REFERRER_LINK_TITLE, data.getReferrerLinkTitle());

        // Redirect trail, only when there is one.
        if (ArrayUtils.isNotEmpty(data.getRedirectTrail())) {
            meta.setString(HttpMetadata.COLLECTOR_REDIRECT_TRAIL, data.getRedirectTrail());
        }
    }

    /**
     * Runs the HTTP importer pipeline on the given context and returns the
     * importer response it produced.
     *
     * @param importerPipelineContext generic pipeline context to wrap
     * @return the importer response held by the HTTP pipeline context after
     *         execution
     */
    protected ImporterResponse executeImporterPipeline(ImporterPipelineContext importerPipelineContext) {
        HttpImporterPipelineContext httpContext = new HttpImporterPipelineContext(importerPipelineContext);
        HttpImporterPipeline pipeline = new HttpImporterPipeline(
                m4getCrawlerConfig().isKeepDownloads(),
                importerPipelineContext.isOrphan());
        pipeline.execute(httpContext);
        return httpContext.getImporterResponse();
    }

    /**
     * Creates crawl data for an embedded reference, using the same depth as
     * the crawl data it comes from.
     *
     * @param str        reference of the embedded item
     * @param iCrawlData crawl data supplying the depth; must be an
     *                   {@link HttpCrawlData}
     * @return new {@link HttpCrawlData} for {@code str} at that depth
     */
    protected BaseCrawlData createEmbeddedCrawlData(String str, ICrawlData iCrawlData) {
        HttpCrawlData parent = (HttpCrawlData) iCrawlData;
        return new HttpCrawlData(str, parent.getDepth());
    }

    /**
     * Runs the HTTP committer pipeline for a document.
     *
     * @param iCrawler         crawler passed to the pipeline context; must be
     *                         an {@link HttpCrawler}
     * @param importerDocument document to commit; must be an {@link HttpDocument}
     * @param iCrawlDataStore  store made available to the pipeline
     * @param baseCrawlData    first crawl data passed to the context; must be
     *                         an {@link HttpCrawlData}
     * @param baseCrawlData2   second crawl data passed to the context; must be
     *                         an {@link HttpCrawlData} (or {@code null})
     */
    protected void executeCommitterPipeline(ICrawler iCrawler, ImporterDocument importerDocument, ICrawlDataStore iCrawlDataStore, BaseCrawlData baseCrawlData, BaseCrawlData baseCrawlData2) {
        HttpCommitterPipeline pipeline = new HttpCommitterPipeline();
        HttpCommitterPipelineContext context = new HttpCommitterPipelineContext(
                (HttpCrawler) iCrawler,
                iCrawlDataStore,
                (HttpDocument) importerDocument,
                (HttpCrawlData) baseCrawlData,
                (HttpCrawlData) baseCrawlData2);
        pipeline.execute(context);
    }

    /**
     * Performs HTTP-specific bookkeeping just before a document's processing
     * is finalized.
     * <ol>
     *   <li>A new or modified document that has a redirect trail gets its
     *       reference recorded in {@code HttpImporterPipeline.GOOD_REDIRECTS}.</li>
     *   <li>If the document was skipped or ended in a bad-status/error state
     *       and has no referenced URLs of its own, the referenced URLs held by
     *       the second crawl data are queued one level deeper, with this
     *       document as their referrer.</li>
     * </ol>
     *
     * @param baseCrawlData    crawl data of the document being finalized; must
     *                         be an {@link HttpCrawlData}
     * @param iCrawlDataStore  store that receives any queued URLs
     * @param importerDocument not used by this implementation
     * @param iCrawlData       second crawl data whose referenced URLs may be
     *                         re-queued (presumably the cached data from a
     *                         previous crawl — confirm against the parent
     *                         class); may be {@code null}
     */
    protected void beforeFinalizeDocumentProcessing(BaseCrawlData baseCrawlData, ICrawlDataStore iCrawlDataStore, ImporterDocument importerDocument, ICrawlData iCrawlData) {
        final HttpCrawlData current = (HttpCrawlData) baseCrawlData;
        final HttpCrawlData previous = (HttpCrawlData) iCrawlData;

        if (current.getState().isNewOrModified() && ArrayUtils.isNotEmpty(current.getRedirectTrail())) {
            HttpImporterPipeline.GOOD_REDIRECTS.add(current.getReference());
        }

        // Nothing to re-queue without a second record, when this document
        // already has its own referenced URLs, or when the second record has none.
        if (iCrawlData == null) {
            return;
        }
        if (ArrayUtils.isNotEmpty(current.getReferencedUrls())) {
            return;
        }
        if (ArrayUtils.isEmpty(previous.getReferencedUrls())) {
            return;
        }

        // Only re-queue for documents that were skipped or failed.
        final CrawlState state = baseCrawlData.getState();
        final boolean skippedOrFailed = state.isSkipped()
                || state.isOneOf(new CrawlState[]{CrawlState.BAD_STATUS, CrawlState.ERROR});
        if (!skippedOrFailed) {
            return;
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Queuing referenced URLs of " + baseCrawlData.getReference());
        }
        final int childDepth = current.getDepth() + 1;
        for (String childUrl : previous.getReferencedUrls()) {
            HttpCrawlData child = new HttpCrawlData(childUrl, childDepth);
            child.setReferrerReference(current.getReference());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Queueing skipped document's child: " + child.getReference());
            }
            executeQueuePipeline(child, iCrawlDataStore);
        }
    }

    /**
     * Marks the original reference as processed when it is non-blank and
     * differs from the crawl data's current reference.
     * <p>
     * A clone of the crawl data is given the original reference (with its own
     * original reference cleared) and submitted to the store as processed.
     *
     * @param baseCrawlData   crawl data to inspect; must be an {@link HttpCrawlData}
     * @param iCrawlDataStore store notified of the processed variation
     */
    protected void markReferenceVariationsAsProcessed(BaseCrawlData baseCrawlData, ICrawlDataStore iCrawlDataStore) {
        final HttpCrawlData data = (HttpCrawlData) baseCrawlData;
        final String original = data.getOriginalReference();
        final String current = data.getReference();

        final boolean hasVariation =
                StringUtils.isNotBlank(original) && ObjectUtils.notEqual(original, current);
        if (!hasVariation) {
            return;
        }

        HttpCrawlData variation = data.clone();
        variation.setReference(original);
        variation.setOriginalReference(null);
        iCrawlDataStore.processed(variation);
    }

    /**
     * Releases the resources held by this crawler: stops the sitemap resolver
     * (when one exists) and closes the HTTP client.
     * <p>
     * A failure while stopping the resolver is logged and not rethrown, so the
     * HTTP client is closed in all cases.
     *
     * @param jobStatusUpdater not used by this implementation
     * @param jobSuite         not used by this implementation
     * @param iCrawlDataStore  not used by this implementation
     */
    protected void cleanupExecution(JobStatusUpdater jobStatusUpdater, JobSuite jobSuite, ICrawlDataStore iCrawlDataStore) {
        try {
            if (this.sitemapResolver != null) {
                this.sitemapResolver.stop();
            }
        } catch (Exception e) {
            // Best effort: keep going, but log the cause so the failure can be diagnosed.
            LOG.error("Could not stop sitemap store.", e);
        }
        closeHttpClient();
    }

    /**
     * Adds a string value to the metadata under the given key, doing nothing
     * when the value is {@code null}.
     *
     * @param httpMetadata metadata to update
     * @param str          metadata key
     * @param str2         value to add; ignored when {@code null}
     */
    private void metadataAddString(HttpMetadata httpMetadata, String str, String str2) {
        if (str2 == null) {
            return;
        }
        httpMetadata.addString(str, new String[]{str2});
    }

    /**
     * Creates the HTTP client from the configured client factory, passing it
     * the configured user agent, and stores it on this crawler.
     */
    private void initializeHTTPClient() {
        final HttpCrawlerConfig config = m4getCrawlerConfig();
        this.httpClient = config.getHttpClientFactory().createHTTPClient(config.getUserAgent());
    }

    /**
     * Replaces the HTTP client's redirect strategy with a
     * {@link RedirectStrategyWrapper} around the existing one, configured with
     * the crawler's redirect URL provider.
     * <p>
     * The strategy is reached reflectively through the client's private
     * {@code execChain} and {@code redirectStrategy} fields. This is best
     * effort: if the fields are missing or inaccessible, or the current value
     * is not a {@link RedirectStrategy}, a warning is logged and the client is
     * left untouched.
     */
    private void initializeRedirectionStrategy() {
        try {
            Object execChain = FieldUtils.readField(this.httpClient, "execChain", true);
            Object currentStrategy = FieldUtils.readField(execChain, "redirectStrategy", true);
            if (currentStrategy instanceof RedirectStrategy) {
                RedirectStrategyWrapper wrapper = new RedirectStrategyWrapper(
                        (RedirectStrategy) currentStrategy, m4getCrawlerConfig().getRedirectURLProvider());
                FieldUtils.writeField(execChain, "redirectStrategy", wrapper, true);
            } else {
                LOG.warn("Could not wrap RedirectStrategy to properly handle redirects.");
            }
        } catch (Exception e) {
            // Reflection depends on HttpClient internals; report the real
            // cause instead of failing the crawl.
            LOG.warn("Could not wrap RedirectStrategy to properly handle redirects.", e);
        }
    }

    /**
     * Closes the HTTP client when it is a {@link CloseableHttpClient}.
     * Clients of any other type (including {@code null}) are left alone.
     * An {@link IOException} raised while closing is logged, not rethrown.
     */
    private void closeHttpClient() {
        if (this.httpClient instanceof CloseableHttpClient) {
            try {
                // close() is declared on CloseableHttpClient, not on the HttpClient interface.
                ((CloseableHttpClient) this.httpClient).close();
            } catch (IOException e) {
                LOG.error(getId() + " Cannot close HttpClient.", e);
            }
        }
    }
}
