package com.norconex.collector.http.pipeline.importer;

import com.norconex.collector.core.CollectorException;
import com.norconex.collector.core.data.store.ICrawlDataStore;
import com.norconex.collector.http.crawler.HttpCrawlerEvent;
import com.norconex.collector.http.data.HttpCrawlData;
import com.norconex.collector.http.data.HttpCrawlState;
import com.norconex.collector.http.doc.HttpDocument;
import com.norconex.collector.http.doc.HttpMetadata;
import com.norconex.collector.http.fetch.HttpFetchResponse;
import com.norconex.collector.http.pipeline.queue.HttpQueuePipeline;
import com.norconex.collector.http.pipeline.queue.HttpQueuePipelineContext;
import com.norconex.collector.http.url.ICanonicalLinkDetector;
import com.norconex.collector.http.url.IURLNormalizer;
import com.norconex.commons.lang.file.ContentType;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/* loaded from: input_file:com/norconex/collector/http/pipeline/importer/HttpImporterPipelineUtil.class */
/**
 * Static helpers shared by the HTTP importer pipeline: content-type
 * metadata handling, canonical link resolution and redirect queuing.
 * Not instantiable.
 */
final class HttpImporterPipelineUtil {
    private static final Logger LOG = LogManager.getLogger(HttpImporterPipelineUtil.class);

    /** Metadata key under which the resolved content type is stored. */
    private static final String COLLECTOR_CONTENT_TYPE = "collector.content-type";

    /** Metadata key under which the resolved content encoding is stored. */
    private static final String COLLECTOR_CONTENT_ENCODING = "collector.content-encoding";

    private HttpImporterPipelineUtil() {
    }

    /**
     * Copies the content type and content encoding found in the document
     * metadata onto the document itself, but only when the document has no
     * content type yet.
     *
     * @param httpDocument the document to update
     */
    public static void applyMetadataToDocument(HttpDocument httpDocument) {
        if (httpDocument.getContentType() != null) {
            return;
        }
        httpDocument.setContentType(
                ContentType.valueOf(httpDocument.m12getMetadata().getString(COLLECTOR_CONTENT_TYPE)));
        httpDocument.setContentEncoding(
                httpDocument.m12getMetadata().getString(COLLECTOR_CONTENT_ENCODING));
    }

    /**
     * Fills in the {@code collector.content-type} and
     * {@code collector.content-encoding} metadata entries from the HTTP
     * content-type header when either of them is blank.
     * <p>
     * The header is read from {@link HttpMetadata#HTTP_CONTENT_TYPE}; when
     * that entry is blank, any key ending with that name is used instead.
     * Entries that already hold a non-blank value are left untouched.
     * </p>
     * <p>
     * NOTE(review): the header value is handed to
     * {@code org.apache.http.entity.ContentType.parse}, which may throw on
     * malformed values; confirm callers tolerate that.
     * </p>
     *
     * @param httpMetadata the metadata to read from and write to
     */
    public static void enhanceHTTPHeaders(HttpMetadata httpMetadata) {
        String collectorType = httpMetadata.getString(COLLECTOR_CONTENT_TYPE);
        String collectorEncoding = httpMetadata.getString(COLLECTOR_CONTENT_ENCODING);
        if (StringUtils.isNotBlank(collectorType) && StringUtils.isNotBlank(collectorEncoding)) {
            return;
        }

        String headerValue = httpMetadata.getString(HttpMetadata.HTTP_CONTENT_TYPE);
        if (StringUtils.isBlank(headerValue)) {
            // Fall back on any key ending with the header name (e.g. a
            // prefixed variant). Blank candidates are skipped so they cannot
            // overwrite a usable value found under another key.
            for (String key : httpMetadata.keySet()) {
                if (StringUtils.endsWith(key, HttpMetadata.HTTP_CONTENT_TYPE)) {
                    String candidate = httpMetadata.getString(key);
                    if (StringUtils.isNotBlank(candidate)) {
                        headerValue = candidate;
                    }
                }
            }
        }
        if (StringUtils.isBlank(headerValue)) {
            return;
        }

        org.apache.http.entity.ContentType parsed =
                org.apache.http.entity.ContentType.parse(headerValue);
        if (StringUtils.isBlank(collectorType)) {
            String mimeType = parsed.getMimeType();
            if (mimeType != null) {
                httpMetadata.addString(COLLECTOR_CONTENT_TYPE, new String[]{mimeType});
            }
        }
        if (StringUtils.isBlank(collectorEncoding)) {
            String charset = Objects.toString(parsed.getCharset(), null);
            if (charset != null) {
                httpMetadata.addString(COLLECTOR_CONTENT_ENCODING, new String[]{charset});
            }
        }
    }

    /**
     * Detects a canonical URL for the current document and, when it differs
     * from the document reference, rejects the document and hands the
     * canonical URL over for queuing.
     * <p>
     * The canonical directive is ignored (and {@code true} is returned) when
     * canonical links are disabled, no detector is configured, nothing is
     * detected, the canonical URL normalizes to {@code null}, it equals the
     * document reference, or it is part of the document redirect trail.
     * </p>
     *
     * @param httpImporterPipelineContext the importer pipeline context
     * @param z {@code true} to detect the canonical URL from the metadata,
     *     {@code false} to detect it from the document content
     * @return {@code true} if the current document is to be processed as is,
     *     {@code false} if it was rejected in favor of its canonical URL
     * @throws CollectorException if the document content cannot be read
     */
    public static boolean resolveCanonical(HttpImporterPipelineContext httpImporterPipelineContext, boolean z) {
        if (httpImporterPipelineContext.m29getConfig().isIgnoreCanonicalLinks()) {
            return true;
        }
        ICanonicalLinkDetector detector =
                httpImporterPipelineContext.m29getConfig().getCanonicalLinkDetector();
        if (detector == null) {
            return true;
        }

        HttpCrawlData crawlData = httpImporterPipelineContext.m28getCrawlData();
        String reference = crawlData.getReference();

        String canonicalUrl;
        if (z) {
            canonicalUrl = detector.detectFromMetadata(reference, httpImporterPipelineContext.getMetadata());
        } else {
            try {
                canonicalUrl = detector.detectFromContent(
                        reference,
                        httpImporterPipelineContext.m27getDocument().getContent(),
                        httpImporterPipelineContext.m27getDocument().getContentType());
            } catch (IOException e) {
                throw new CollectorException("Cannot resolve canonical link from content for: " + reference, e);
            }
        }
        if (StringUtils.isBlank(canonicalUrl)) {
            return true;
        }

        // The normalized form is only used for the comparisons below; the
        // URL as detected is what gets scope-checked and queued.
        String normalizedUrl = canonicalUrl;
        IURLNormalizer urlNormalizer = httpImporterPipelineContext.m29getConfig().getUrlNormalizer();
        if (urlNormalizer != null) {
            normalizedUrl = urlNormalizer.normalizeURL(normalizedUrl);
        }
        if (normalizedUrl == null) {
            LOG.info("Canonical URL detected is null after normalization so it will be ignored and its referrer will be processed instead.  Canonical URL: \"" + canonicalUrl + "\" Referrer URL: " + reference);
            // The canonical directive is ignored here, so the current
            // document keeps being processed, like the other "ignored"
            // cases below.
            return true;
        }
        if (normalizedUrl.equals(reference)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Canonical URL detected is the same as document URL. Process normally. URL: " + reference);
            }
            return true;
        }
        if (ArrayUtils.contains(crawlData.getRedirectTrail(), normalizedUrl)) {
            LOG.warn("Circular reference between redirect and canonical URL detected. Will ignore canonical directive and process URL: \"" + reference + "\". Redirect trail: " + Arrays.toString(crawlData.getRedirectTrail()));
            return true;
        }

        // The canonical URL points elsewhere: build crawl data for it, with
        // the current document as its referrer.
        HttpCrawlData canonicalData = crawlData.clone();
        canonicalData.setReference(canonicalUrl);
        canonicalData.setReferrerReference(reference);
        if (httpImporterPipelineContext.m29getConfig().getURLCrawlScopeStrategy().isInScope(reference, canonicalUrl)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Canonical URL detected is different than document URL. Document will be rejected while canonical URL will be queued for processing: " + canonicalUrl);
            }
            new HttpQueuePipeline().execute(new HttpQueuePipelineContext(
                    httpImporterPipelineContext.m30getCrawler(),
                    httpImporterPipelineContext.getCrawlDataStore(),
                    canonicalData));
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Canonical URL not in scope: " + canonicalUrl);
            }
            canonicalData.setState(HttpCrawlState.REJECTED);
            httpImporterPipelineContext.fireCrawlerEvent(
                    "REJECTED_FILTER",
                    canonicalData,
                    httpImporterPipelineContext.m29getConfig().getURLCrawlScopeStrategy());
        }

        // In both cases the current (non-canonical) document is rejected.
        crawlData.setState(HttpCrawlState.REJECTED);
        httpImporterPipelineContext.m30getCrawler().fireCrawlerEvent("REJECTED_NONCANONICAL", crawlData, detector);
        return false;
    }

    /**
     * Marks the current document as redirected and queues the redirect
     * target, unless the target is already active, queued, or was processed
     * and does not warrant another attempt.
     * <p>
     * A target that was already processed is queued again when it is neither
     * in the current redirect trail nor a known good redirect. In that case
     * it is put directly on the crawl data store, without going through the
     * queue pipeline or the scope check. Calls are serialized on the class
     * monitor.
     * </p>
     *
     * @param httpImporterPipelineContext the importer pipeline context
     * @param httpFetchResponse the fetch response that produced the redirect
     * @param str the redirect target URL
     */
    public static synchronized void queueRedirectURL(HttpImporterPipelineContext httpImporterPipelineContext, HttpFetchResponse httpFetchResponse, String str) {
        ICrawlDataStore store = httpImporterPipelineContext.getCrawlDataStore();
        HttpCrawlData crawlData = httpImporterPipelineContext.m28getCrawlData();
        String sourceUrl = crawlData.getReference();
        String targetUrl = str;

        // The source document is flagged as redirected regardless of what
        // happens to the target afterwards.
        crawlData.setState(HttpCrawlState.REDIRECT);
        HttpFetchResponse redirectResponse = new HttpFetchResponse(
                HttpCrawlState.REDIRECT,
                httpFetchResponse.getStatusCode(),
                httpFetchResponse.getReasonPhrase() + " (" + targetUrl + ")");
        httpImporterPipelineContext.fireCrawlerEvent(HttpCrawlerEvent.REJECTED_REDIRECTED, crawlData, redirectResponse);

        if (store.isActive(targetUrl)) {
            logRedirectTargetAlreadyHandled("being processed", sourceUrl, targetUrl);
            return;
        }
        if (store.isQueued(targetUrl)) {
            logRedirectTargetAlreadyHandled("queued", sourceUrl, targetUrl);
            return;
        }

        boolean requeue = false;
        if (store.isProcessed(targetUrl)) {
            if (ArrayUtils.contains(crawlData.getRedirectTrail(), targetUrl)) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Redirect encountered for 3rd time, rejecting: " + targetUrl);
                }
                logRedirectTargetAlreadyHandled("processed", sourceUrl, targetUrl);
                return;
            }
            if (HttpImporterPipeline.GOOD_REDIRECTS.contains(targetUrl)) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Redirect URL previously processed and was valid, rejecting: " + targetUrl);
                }
                logRedirectTargetAlreadyHandled("processed", sourceUrl, targetUrl);
                return;
            }
            requeue = true;
            if (LOG.isDebugEnabled()) {
                LOG.debug("Redirect URL encountered a second time, re-queue it again (once) in case it came from a circular reference: " + targetUrl);
            }
        }

        // The target inherits the depth and referrer details of the source,
        // and the source is appended to its redirect trail.
        HttpCrawlData targetData = new HttpCrawlData(targetUrl, crawlData.getDepth());
        targetData.setReferrerReference(crawlData.getReferrerReference());
        targetData.setReferrerLinkTag(crawlData.getReferrerLinkTag());
        targetData.setReferrerLinkText(crawlData.getReferrerLinkText());
        targetData.setReferrerLinkTitle(crawlData.getReferrerLinkTitle());
        targetData.setRedirectTrail((String[]) ArrayUtils.add(crawlData.getRedirectTrail(), sourceUrl));

        if (requeue) {
            store.queue(targetData);
            return;
        }
        if (httpImporterPipelineContext.m29getConfig().getURLCrawlScopeStrategy().isInScope(sourceUrl, targetUrl)) {
            new HttpQueuePipeline().execute(new HttpQueuePipelineContext(
                    httpImporterPipelineContext.m30getCrawler(), store, targetData));
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("URL redirect target not in scope: " + targetUrl);
            }
            targetData.setState(HttpCrawlState.REJECTED);
            httpImporterPipelineContext.fireCrawlerEvent(
                    "REJECTED_FILTER",
                    targetData,
                    httpImporterPipelineContext.m29getConfig().getURLCrawlScopeStrategy());
        }
    }

    /**
     * Logs, at debug level, that a redirect target needs no queuing.
     *
     * @param status how the target is already handled (e.g. "queued")
     * @param sourceUrl the URL that redirected
     * @param targetUrl the redirect target URL
     */
    private static void logRedirectTargetAlreadyHandled(String status, String sourceUrl, String targetUrl) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Redirect target URL is already " + status + ": " + targetUrl + " (from: " + sourceUrl + ").");
        }
    }
}
