package com.norconex.collector.http.pipeline.importer;

import com.norconex.collector.http.crawler.HttpCrawlerEvent;
import com.norconex.collector.http.data.HttpCrawlData;
import com.norconex.collector.http.doc.HttpMetadata;
import com.norconex.collector.http.pipeline.queue.HttpQueuePipeline;
import com.norconex.collector.http.pipeline.queue.HttpQueuePipelineContext;
import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.Link;
import com.norconex.commons.lang.file.ContentType;
import com.norconex.commons.lang.io.CachedInputStream;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.collections4.SetUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: input_file:com/norconex/collector/http/pipeline/importer/LinkExtractorStage.class */
/**
 * Importer pipeline stage that extracts links from the current document,
 * queues the ones that are within the crawl scope, and records the resulting
 * URLs on the document metadata and crawl data.
 */
public class LinkExtractorStage extends AbstractImporterStage {
    private static final Logger LOG = LogManager.getLogger(LinkExtractorStage.class);

    /**
     * Extracts links from the current document and processes each of them.
     * <p>
     * In-scope links are sent through the queue pipeline; the references that
     * come out of it are stored under
     * {@link HttpMetadata#COLLECTOR_REFERENCED_URLS} and on the crawl data.
     * Out-of-scope links are stored under
     * {@link HttpMetadata#COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE} only when the
     * configuration asks to keep them. A
     * {@link HttpCrawlerEvent#URLS_EXTRACTED} event is fired once all links
     * have been processed. When no link is extracted, nothing is recorded and
     * no event is fired.
     *
     * @param httpImporterPipelineContext the importer pipeline context
     * @return always {@code true}
     */
    @Override // com.norconex.collector.http.pipeline.importer.AbstractImporterStage
    public boolean executeStage(HttpImporterPipelineContext httpImporterPipelineContext) {
        // extractLinks(...) never returns null, so an emptiness check is enough.
        Set<Link> links = extractLinks(httpImporterPipelineContext);
        if (links.isEmpty()) {
            return true;
        }

        String reference = httpImporterPipelineContext.m28getCrawlData().getReference();
        Set<String> uniqueExtractedURLs = new HashSet<String>();
        Set<String> uniqueQueuedURLs = new HashSet<String>();
        Set<String> uniqueOutOfScopeURLs = new HashSet<String>();

        for (Link link : links) {
            // A failure on one link must not prevent the remaining links
            // from being processed, hence the per-link try/catch.
            try {
                if (httpImporterPipelineContext.m29getConfig().getURLCrawlScopeStrategy().isInScope(reference, link.getUrl())) {
                    String queuedURL = queueURL(link, httpImporterPipelineContext, uniqueExtractedURLs);
                    if (StringUtils.isNotBlank(queuedURL)) {
                        uniqueQueuedURLs.add(queuedURL);
                    }
                } else {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("URL not in crawl scope: " + link.getUrl() + " (keep: " + httpImporterPipelineContext.m29getConfig().isKeepOutOfScopeLinks() + ")");
                    }
                    if (httpImporterPipelineContext.m29getConfig().isKeepOutOfScopeLinks()) {
                        uniqueOutOfScopeURLs.add(link.getUrl());
                    }
                }
            } catch (Exception e) {
                LOG.warn("Could not queue extracted URL \"" + link.getUrl() + "\".", e);
            }
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("uniqueQueuedURLs count: " + uniqueQueuedURLs.size() + ".");
        }
        if (!uniqueQueuedURLs.isEmpty()) {
            String[] referencedUrls = uniqueQueuedURLs.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            httpImporterPipelineContext.getMetadata().addString(HttpMetadata.COLLECTOR_REFERENCED_URLS, referencedUrls);
            httpImporterPipelineContext.m28getCrawlData().setReferencedUrls(referencedUrls);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("uniqueOutOfScopeURLs count: " + uniqueOutOfScopeURLs.size() + ".");
        }
        if (!uniqueOutOfScopeURLs.isEmpty()) {
            httpImporterPipelineContext.getMetadata().addString(
                    HttpMetadata.COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE,
                    uniqueOutOfScopeURLs.toArray(ArrayUtils.EMPTY_STRING_ARRAY));
        }

        httpImporterPipelineContext.fireCrawlerEvent(HttpCrawlerEvent.URLS_EXTRACTED, httpImporterPipelineContext.m28getCrawlData(), uniqueQueuedURLs);
        return true;
    }

    /**
     * Runs every configured link extractor that accepts the current document
     * and merges the links they return.
     *
     * @param ctx the importer pipeline context
     * @return the extracted links; an empty set (never {@code null}) when no
     *         extractor is configured, when the robots meta is "nofollow",
     *         or when nothing was extracted
     */
    private Set<Link> extractLinks(HttpImporterPipelineContext ctx) {
        String reference = ctx.m28getCrawlData().getReference();
        ILinkExtractor[] extractors = ctx.m29getConfig().getLinkExtractors();
        if (ArrayUtils.isEmpty(extractors)) {
            LOG.debug("No configured link extractor.  No links will be detected.");
            return SetUtils.emptySet();
        }
        if (ctx.getRobotsMeta() != null && ctx.getRobotsMeta().isNofollow()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("No URLs extracted due to Robots nofollow rule for URL: " + reference);
            }
            return SetUtils.emptySet();
        }

        Set<Link> links = new HashSet<Link>();
        CachedInputStream content = ctx.getContent();
        ContentType contentType = ctx.m27getDocument().getContentType();
        for (ILinkExtractor extractor : extractors) {
            if (!extractor.accepts(reference, contentType)) {
                continue;
            }
            try {
                Set<Link> extracted = extractor.extractLinks(content, reference, contentType);
                if (extracted != null) {
                    links.addAll(extracted);
                }
            } catch (Exception e) {
                // One failing extractor must not stop the others.
                LOG.error("Could not extract links from: " + reference, e);
            } finally {
                // The same stream is handed to each extractor, so rewind it
                // after every attempt, successful or not.
                content.rewind();
            }
        }
        return links;
    }

    /**
     * Sends a link through the queue pipeline, unless its URL was already
     * handled for the current document.
     *
     * @param link the extracted link
     * @param ctx the importer pipeline context
     * @param uniqueExtractedURLs URLs already handled for this document; the
     *        link URL is added to it by this method
     * @return the crawl data reference after the queue pipeline ran (it may
     *         differ from the link URL), or {@code null} if the URL was
     *         already handled
     */
    private String queueURL(Link link, HttpImporterPipelineContext ctx, Set<String> uniqueExtractedURLs) {
        // Set.add returns false for a duplicate: skip links already handled.
        if (!uniqueExtractedURLs.add(link.getUrl())) {
            return null;
        }

        // The new URL is one level deeper than the document that references it.
        HttpCrawlData newCrawlData = new HttpCrawlData(link.getUrl(), ctx.m28getCrawlData().getDepth() + 1);
        newCrawlData.setReferrerReference(link.getReferrer());
        newCrawlData.setReferrerLinkTag(link.getTag());
        newCrawlData.setReferrerLinkText(link.getText());
        newCrawlData.setReferrerLinkTitle(link.getTitle());

        new HttpQueuePipeline().execute(new HttpQueuePipelineContext(ctx.m30getCrawler(), ctx.getCrawlDataStore(), newCrawlData));

        // Read the reference back after the pipeline ran; it is compared with
        // the original URL below to report any modification.
        String queuedReference = newCrawlData.getReference();
        if (LOG.isDebugEnabled() && !link.getUrl().equals(queuedReference)) {
            LOG.debug("URL modified from \"" + link.getUrl() + "\" to \"" + queuedReference + "\"");
        }
        return queuedReference;
    }
}
