package com.norconex.collector.http.sitemap.impl;

import com.norconex.collector.http.client.impl.GenericHttpClientFactory;
import com.norconex.collector.http.data.HttpCrawlData;
import com.norconex.collector.http.doc.HttpMetadata;
import com.norconex.collector.http.filter.impl.SegmentCountURLFilter;
import com.norconex.collector.http.sitemap.ISitemapResolver;
import com.norconex.collector.http.sitemap.SitemapURLAdder;
import com.norconex.commons.lang.file.FileUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;

/* loaded from: input_file:com/norconex/collector/http/sitemap/impl/StandardSitemapResolver.class */
public class StandardSitemapResolver implements ISitemapResolver {
    /** Logger shared by all resolver instances. */
    private static final Logger LOG = LogManager.getLogger(StandardSitemapResolver.class);

    /** Sitemap paths appended to a URL root when no other paths are configured. */
    public static final String[] DEFAULT_SITEMAP_PATHS = {"/sitemap.xml", "/sitemap_index.xml"};

    /** Store consulted and updated to track which URL roots were already resolved. */
    private final SitemapStore sitemapStore;

    /** When true, sitemap URLs are accepted even if they do not start with the sitemap's directory. */
    private boolean lenient;

    /**
     * Set by {@link #stop()} and polled while fetching/parsing; volatile so a stop
     * request issued from another thread is visible to the threads doing the work.
     */
    private volatile boolean stopped;

    /** Directory receiving downloaded sitemaps; the system temp directory is used when null. */
    private File tempDir;

    /** URL roots whose sitemaps are currently being resolved. */
    private final Set<String> activeURLRoots = Collections.synchronizedSet(new HashSet<String>());

    /**
     * Paths combined with a URL root to guess sitemap locations. Starts as a private
     * copy so that the public default array is never aliased by instance state.
     */
    private String[] sitemapPaths = DEFAULT_SITEMAP_PATHS.clone();

    /** Minimum last-modified time (epoch millis) for sitemap URLs; values &lt;= 0 disable the check. */
    private long from = -1;

    /** When true, fetch/parse failures are rethrown as runtime exceptions after being logged. */
    private boolean escalateErrors = false;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/norconex/collector/http/sitemap/impl/StandardSitemapResolver$ParseState.class */
    /**
     * Mutable cursor state kept while streaming through one sitemap document:
     * the URL entry being assembled plus flags telling which element the
     * parser is currently inside.
     */
    public static class ParseState {
        /** Entry being built for the current {@code <url>} element, or null outside of one. */
        private HttpCrawlData baseURL;
        /** True while inside a {@code <sitemap>} element. */
        private boolean sitemapIndex;
        /** True after a {@code <loc>} start tag until its text has been consumed. */
        private boolean loc;
        /** True after a {@code <lastmod>} start tag until its text has been consumed. */
        private boolean lastmod;
        /** True after a {@code <changefreq>} start tag until its text has been consumed. */
        private boolean changefreq;
        /** True after a {@code <priority>} start tag until its text has been consumed. */
        private boolean priority;

        /** Creates a blank state: no current entry and every flag cleared. */
        private ParseState() {
            // Fields keep their Java defaults (null / false).
        }
    }

    /**
     * Creates a resolver.
     *
     * @param file directory in which downloaded sitemaps are temporarily stored
     *        (the system temp directory is used at download time when null)
     * @param sitemapStore store used to remember which URL roots are resolved
     */
    public StandardSitemapResolver(File file, SitemapStore sitemapStore) {
        this.sitemapStore = sitemapStore;
        this.tempDir = file;
    }

    /**
     * Gets the paths that are appended to a URL root to guess sitemap locations.
     *
     * @return a copy of the configured paths (null when none are set); a copy is
     *         returned so callers cannot alter this resolver's configuration or
     *         the shared default array through the returned reference
     */
    public String[] getSitemapPaths() {
        return ArrayUtils.clone(this.sitemapPaths);
    }

    /**
     * Sets the paths that are appended to a URL root to guess sitemap locations.
     * The supplied array is copied, so later changes to it are not reflected here.
     *
     * @param strArr sitemap paths (may be null or empty)
     */
    public void setSitemapPaths(String... strArr) {
        String[] copy = ArrayUtils.clone(strArr);
        this.sitemapPaths = copy;
    }

    /**
     * Resolves every sitemap applicable to a URL root, unless that root was
     * already resolved or is currently being resolved.
     *
     * @param httpClient client used to download the sitemaps
     * @param str URL root the sitemaps belong to
     * @param strArr explicitly supplied sitemap locations (may be null)
     * @param sitemapURLAdder receives each URL found in the sitemaps
     * @param z when true only the supplied locations are used; when false they
     *        are combined with the configured sitemap paths under the URL root
     */
    @Override // com.norconex.collector.http.sitemap.ISitemapResolver
    public void resolveSitemaps(HttpClient httpClient, String str, String[] strArr, SitemapURLAdder sitemapURLAdder, boolean z) {
        if (!isResolutionRequired(str)) {
            return;
        }
        try {
            // Locations already visited during this call (guards against sitemap index loops).
            Set<String> resolvedLocations = new HashSet<String>();
            Set<String> locations;
            if (z) {
                locations = new HashSet<String>();
                if (strArr != null) {
                    locations.addAll(Arrays.asList(strArr));
                }
            } else {
                locations = combineLocations(strArr, str);
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Sitemap locations: " + locations);
            }
            for (String location : locations) {
                resolveLocation(location, httpClient, sitemapURLAdder, resolvedLocations);
            }
            this.sitemapStore.markResolved(str);
        } finally {
            // Always release the "in progress" marker; otherwise a failure (e.g. an
            // escalated error) would leave this URL root flagged as active forever.
            this.activeURLRoots.remove(str);
        }
    }

    /**
     * Decides whether sitemaps must be resolved for a URL root and, if so,
     * flags that root as being processed.
     *
     * @param str URL root
     * @return true if the caller should go ahead with resolution; false if the
     *         root is already being processed or was resolved before
     */
    private synchronized boolean isResolutionRequired(String str) {
        final boolean alreadyHandled = activeURLRoots.contains(str) || sitemapStore.isResolved(str);
        if (!alreadyHandled) {
            activeURLRoots.add(str);
            return true;
        }
        LOG.trace("Sitemap locations were already processed or are being processed for URL root: " + str);
        return false;
    }

    /**
     * No longer functional: only logs a warning.
     *
     * @return always null
     * @deprecated Since 2.3.0, use {@code HttpCrawlerConfig#getSitemaps()} instead.
     */
    @Deprecated
    public String[] getSitemapLocations() {
        // Message now names this method correctly (it previously referred to a non-existent getSitemapLocation()).
        LOG.warn("Since 2.3.0, calling StandardSitemapResolver#getSitemapLocations() has no effect. Use HttpCrawlerConfig#getSitemaps() instead.");
        return null;
    }

    /**
     * No longer functional: the argument is ignored and only a warning is logged.
     *
     * @param strArr ignored
     * @deprecated Since 2.3.0, use {@code HttpCrawlerConfig#setSitemaps(String[] ...)} instead.
     */
    @Deprecated
    public void setSitemapLocations(String... strArr) {
        // Marked deprecated like its getter counterpart; message now names this method correctly.
        LOG.warn("Since 2.3.0, calling StandardSitemapResolver#setSitemapLocations(String...) has no effect. Use HttpCrawlerConfig#setSitemaps(String[] ...) instead.");
    }

    /**
     * Tells whether sitemap URLs are accepted even when they do not start with
     * the directory of the sitemap that listed them.
     *
     * @return true if lenient
     */
    public boolean isLenient() {
        return lenient;
    }

    /**
     * Sets whether sitemap URLs are accepted even when they do not start with
     * the directory of the sitemap that listed them.
     *
     * @param z true to be lenient
     */
    public void setLenient(boolean z) {
        lenient = z;
    }

    /**
     * Gets the minimum last-modified time sitemap URLs must exceed to be kept.
     *
     * @return threshold in epoch milliseconds; zero or negative means no filtering
     */
    public long getFrom() {
        return from;
    }

    /**
     * Sets the minimum last-modified time sitemap URLs must exceed to be kept.
     *
     * @param j threshold in epoch milliseconds; zero or negative disables filtering
     */
    public void setFrom(long j) {
        from = j;
    }

    /**
     * Tells whether sitemap fetch/parse failures are rethrown as runtime
     * exceptions in addition to being logged.
     *
     * @return true if errors are escalated
     */
    public boolean isEscalateErrors() {
        return escalateErrors;
    }

    /**
     * Sets whether sitemap fetch/parse failures are rethrown as runtime
     * exceptions in addition to being logged.
     *
     * @param z true to escalate errors
     */
    public void setEscalateErrors(boolean z) {
        escalateErrors = z;
    }

    /**
     * Gets the directory where downloaded sitemaps are temporarily stored.
     *
     * @return the directory, or null if none was set
     */
    public File getTempDir() {
        return tempDir;
    }

    /**
     * Sets the directory where downloaded sitemaps are temporarily stored.
     *
     * @param file the directory (null falls back to the system temp directory at download time)
     */
    public void setTempDir(File file) {
        tempDir = file;
    }

    /**
     * Requests that ongoing sitemap resolution ends as soon as possible,
     * then closes the sitemap store.
     */
    @Override // com.norconex.collector.http.sitemap.ISitemapResolver
    public void stop() {
        // Raise the flag first so fetch/parse loops see it before the store goes away.
        stopped = true;
        sitemapStore.close();
    }

    /**
     * Downloads one sitemap (or sitemap index) and parses it, feeding the URLs
     * it contains to the adder. Sitemap index entries are resolved recursively
     * through {@link #parseLocation}.
     * <p>
     * Failures are logged; they are rethrown as {@link RuntimeException} only
     * when error escalation is enabled. Whatever the outcome, the location is
     * recorded as visited and the HTTP connection is released.
     *
     * @param str sitemap location (URL) to fetch
     * @param httpClient client used for the download
     * @param sitemapURLAdder receives each URL found
     * @param set locations already visited during the current resolution;
     *        updated by this method
     */
    private void resolveLocation(String str, HttpClient httpClient, SitemapURLAdder sitemapURLAdder, Set<String> set) {
        if (set.contains(str)) {
            return;
        }
        if (this.stopped) {
            LOG.debug("Skipping resolution of sitemap location (stop requested): " + str);
            return;
        }
        final int okStatus = 200;
        final int notFoundStatus = 404;
        HttpGet httpGet = null;
        try {
            httpGet = new HttpGet(str);
            HttpResponse response = httpClient.execute(httpGet);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == okStatus) {
                LOG.info("Resolving sitemap: " + str);
                InputStream content = response.getEntity().getContent();
                File tempFile;
                try {
                    // A response may legitimately lack a Content-Type header.
                    String contentType = response.containsHeader(HttpMetadata.HTTP_CONTENT_TYPE)
                            ? response.getFirstHeader(HttpMetadata.HTTP_CONTENT_TYPE).getValue()
                            : null;
                    if (contentType != null && contentType.endsWith("gzip")) {
                        content = new GZIPInputStream(content);
                    }
                    tempFile = inputStreamToTempFile(content);
                } finally {
                    // Close before parsing so the connection is not held during (recursive) parsing.
                    IOUtils.closeQuietly(content);
                }
                parseLocation(tempFile, httpClient, sitemapURLAdder, set, str);
                LOG.info("         Resolved: " + str);
            } else if (statusCode == notFoundStatus) {
                LOG.debug("Sitemap not found : " + str);
                if (this.escalateErrors) {
                    throw new RuntimeException("Sitemap not found : " + str);
                }
            } else {
                String message = "Could not obtain sitemap: " + str
                        + ".  Expected status code " + okStatus + ", but got " + statusCode;
                LOG.error(message);
                if (this.escalateErrors) {
                    throw new RuntimeException(message);
                }
            }
        } catch (XMLStreamException e) {
            // Must precede the generic handler, otherwise it can never be reached.
            LOG.error("Cannot fetch sitemap: " + str + " -- Likely an invalid sitemap XML format causing a parsing error (actual error: " + e.getMessage() + ").");
            if (this.escalateErrors) {
                throw new RuntimeException(e);
            }
        } catch (Exception e) {
            LOG.error("Cannot fetch sitemap: " + str + " (" + e.getMessage() + ")");
            if (this.escalateErrors) {
                throw new RuntimeException(e);
            }
        } finally {
            set.add(str);
            if (httpGet != null) {
                httpGet.releaseConnection();
            }
        }
    }

    /**
     * Copies a stream into a new temporary {@code sitemap-*.xml} file created
     * in the configured temp directory (or the system temp directory when none
     * is configured).
     *
     * @param inputStream content to save
     * @return the temporary file holding the content
     * @throws IOException if the file cannot be created or written; in the
     *         latter case the partially written file is removed
     */
    private File inputStreamToTempFile(InputStream inputStream) throws IOException {
        File directory = getTempDir();
        if (directory == null) {
            directory = FileUtils.getTempDirectory();
        }
        File tempFile = File.createTempFile("sitemap-", ".xml", directory);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Temporarily saving sitemap at: " + tempFile.getAbsolutePath());
        }
        boolean copied = false;
        try {
            FileUtils.copyInputStreamToFile(inputStream, tempFile);
            copied = true;
        } finally {
            if (!copied) {
                // Do not leave a truncated temp file behind when the download fails mid-way.
                FileUtils.deleteQuietly(tempFile);
            }
        }
        return tempFile;
    }

    /**
     * Streams through a downloaded sitemap file, dispatching start/end/text
     * events to the parse helpers. Locations listed in a sitemap index are
     * resolved recursively. The file is deleted once parsing ends, whether it
     * succeeded, failed, or was interrupted by a stop request.
     *
     * @param file temporary file holding the sitemap XML
     * @param httpClient client used to fetch nested sitemaps
     * @param sitemapURLAdder receives each URL found
     * @param set locations already visited during the current resolution
     * @param str location (URL) the sitemap was downloaded from
     * @throws XMLStreamException if the XML cannot be parsed
     * @throws IOException if the file cannot be read or deleted
     */
    private void parseLocation(File file, HttpClient httpClient, SitemapURLAdder sitemapURLAdder, Set<String> set, String str) throws XMLStreamException, IOException {
        boolean completed = false;
        try (FileInputStream fileInputStream = new FileInputStream(file)) {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            factory.setProperty(XMLInputFactory.IS_COALESCING, true);
            // Sitemaps are remote, untrusted content: disable DTDs and external entities (XXE).
            factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
            XMLStreamReader reader = factory.createXMLStreamReader(new StripInvalidCharInputStream(fileInputStream));
            ParseState parseState = new ParseState();
            // Directory of the sitemap: everything before the last "/" of its location.
            String locationDir = StringUtils.substringBeforeLast(str, "/");
            int eventType = reader.getEventType();
            while (true) {
                if (this.stopped) {
                    LOG.debug("Sitemap not entirely parsed due to crawler being stopped.");
                    break;
                }
                switch (eventType) {
                    case XMLStreamReader.START_ELEMENT:
                        parseStartElement(parseState, reader.getLocalName());
                        break;
                    case XMLStreamReader.END_ELEMENT:
                        parseEndElement(sitemapURLAdder, parseState, locationDir, reader.getLocalName());
                        break;
                    case XMLStreamReader.CHARACTERS:
                        String text = reader.getText();
                        if (parseState.sitemapIndex && parseState.loc) {
                            // <loc> of a sitemap index entry: fetch the nested sitemap.
                            resolveLocation(text, httpClient, sitemapURLAdder, set);
                            parseState.loc = false;
                        } else if (parseState.baseURL != null) {
                            parseCharacters(parseState, text);
                        }
                        break;
                    default:
                        break;
                }
                if (!reader.hasNext()) {
                    break;
                }
                eventType = reader.next();
            }
            completed = true;
        } finally {
            if (!completed) {
                // Best-effort cleanup that must not mask the exception being propagated.
                FileUtils.deleteQuietly(file);
            }
        }
        FileUtil.delete(file);
    }

    /**
     * Checks the current entry against the configured minimum last-modified time.
     *
     * @param parseState state holding the entry being evaluated
     * @return true if no threshold is set, the entry has no last-modified
     *         value, or that value is more recent than the threshold
     */
    private boolean passedFrom(ParseState parseState) {
        Long lastModified = parseState.baseURL.getSitemapLastMod();
        if (from <= 0) {
            return true;
        }
        if (lastModified == null) {
            return true;
        }
        return lastModified.longValue() > from;
    }

    /**
     * Handles an element end tag. Leaving {@code <sitemap>} clears the
     * sitemap-index flag. Leaving {@code <url>} submits the assembled entry to
     * the adder when it is acceptable for the sitemap's directory and passes
     * the last-modified threshold, then discards it.
     *
     * @param sitemapURLAdder receives accepted entries
     * @param parseState current parse state
     * @param str directory of the sitemap being parsed
     * @param str2 local name of the element being closed
     */
    private void parseEndElement(SitemapURLAdder sitemapURLAdder, ParseState parseState, String str, String str2) {
        if ("sitemap".equalsIgnoreCase(str2)) {
            parseState.sitemapIndex = false;
            return;
        }
        if (!"url".equalsIgnoreCase(str2)) {
            return;
        }
        // No entry in progress (e.g. a <url> nested in another one was already
        // closed): nothing to submit, and dereferencing would throw.
        if (parseState.baseURL == null || parseState.baseURL.getReference() == null) {
            return;
        }
        if (isRelaxed(parseState, str)) {
            if (passedFrom(parseState)) {
                sitemapURLAdder.add(parseState.baseURL);
            }
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("Sitemap URL invalid for location directory. URL:" + parseState.baseURL.getReference() + " Location directory: " + str);
        }
        parseState.baseURL = null;
    }

    /**
     * Tells whether the current entry may be accepted with respect to the
     * sitemap's directory.
     *
     * @param parseState state holding the entry being evaluated
     * @param str directory of the sitemap being parsed
     * @return true if this resolver is lenient or the entry's reference starts
     *         with the given directory
     */
    private boolean isRelaxed(ParseState parseState, String str) {
        if (lenient) {
            return true;
        }
        String reference = parseState.baseURL.getReference();
        return reference.startsWith(str);
    }

    /**
     * Stores element text on the entry being built, according to whichever
     * element flag is currently raised (checked in the order loc, lastmod,
     * changefreq, priority), and lowers that flag. Unparsable dates and
     * priorities are logged and skipped.
     *
     * @param parseState current parse state; its entry must not be null
     * @param str text content of the current element
     */
    private void parseCharacters(ParseState parseState, String str) {
        final HttpCrawlData entry = parseState.baseURL;
        if (parseState.loc) {
            entry.setReference(str);
            parseState.loc = false;
        } else if (parseState.lastmod) {
            try {
                long millis = DateTime.parse(str).getMillis();
                entry.setSitemapLastMod(Long.valueOf(millis));
            } catch (Exception e) {
                LOG.info("Invalid sitemap date: " + str);
            }
            parseState.lastmod = false;
        } else if (parseState.changefreq) {
            entry.setSitemapChangeFreq(str);
            parseState.changefreq = false;
        } else if (parseState.priority) {
            try {
                float value = Float.parseFloat(str);
                entry.setSitemapPriority(Float.valueOf(value));
            } catch (NumberFormatException e) {
                LOG.info("Invalid sitemap priority: " + str);
            }
            parseState.priority = false;
        }
    }

    /**
     * Handles an element start tag by updating the parse state: entering
     * {@code <sitemap>} raises the sitemap-index flag, entering {@code <url>}
     * starts a new blank entry, and the loc / lastmod / changefreq / priority
     * elements raise their matching flag. Names are matched case-insensitively;
     * any other element is ignored.
     *
     * @param parseState current parse state
     * @param str local name of the element being opened
     */
    private void parseStartElement(ParseState parseState, String str) {
        if ("sitemap".equalsIgnoreCase(str)) {
            parseState.sitemapIndex = true;
        } else if ("url".equalsIgnoreCase(str)) {
            parseState.baseURL = new HttpCrawlData("", 0);
        } else if ("loc".equalsIgnoreCase(str)) {
            parseState.loc = true;
        } else if ("lastmod".equalsIgnoreCase(str)) {
            parseState.lastmod = true;
        } else if ("changefreq".equalsIgnoreCase(str)) {
            parseState.changefreq = true;
        } else if ("priority".equalsIgnoreCase(str)) {
            parseState.priority = true;
        }
    }

    /**
     * Builds the set of sitemap locations to try for a URL root: the explicitly
     * supplied locations plus the URL root combined with each configured
     * sitemap path.
     *
     * @param strArr explicitly supplied sitemap locations (may be null or empty)
     * @param str URL root to which the configured sitemap paths are appended
     * @return the distinct locations, never null
     */
    private Set<String> combineLocations(String[] strArr, String str) {
        final String pathSeparator = "/";
        Set<String> locations = new HashSet<String>();
        if (ArrayUtils.isNotEmpty(strArr)) {
            locations.addAll(Arrays.asList(strArr));
        }
        String[] paths = getSitemapPaths();
        if (ArrayUtils.isEmpty(paths)) {
            LOG.debug("No sitemap paths specified.");
            return locations;
        }
        for (String path : paths) {
            if (path == null) {
                // A null entry cannot form a location; skip it rather than fail.
                continue;
            }
            // Configured paths may omit the leading slash.
            String normalized = path.startsWith(pathSeparator) ? path : pathSeparator + path;
            locations.add(str + normalized);
        }
        return locations;
    }

    /**
     * Compares this resolver with another object. Two resolvers are equal when
     * their lenient flag, temp directory and sitemap paths are equal.
     *
     * @param obj object to compare with
     * @return true if {@code obj} is a resolver with the same configuration
     */
    @Override
    public boolean equals(Object obj) {
        if (obj instanceof StandardSitemapResolver) {
            StandardSitemapResolver other = (StandardSitemapResolver) obj;
            EqualsBuilder builder = new EqualsBuilder();
            builder.append(lenient, other.lenient);
            builder.append(tempDir, other.tempDir);
            builder.append(sitemapPaths, other.sitemapPaths);
            return builder.isEquals();
        }
        return false;
    }

    /**
     * Computes a hash code from the lenient flag, temp directory and sitemap
     * paths (the same fields used by {@link #equals(Object)}).
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        HashCodeBuilder builder = new HashCodeBuilder();
        builder.append(lenient);
        builder.append(tempDir);
        builder.append(sitemapPaths);
        return builder.toHashCode();
    }

    /**
     * Describes this resolver in short-prefix style, listing the lenient flag,
     * temp directory and sitemap paths.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        ToStringBuilder builder = new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE);
        builder.append("lenient", lenient);
        builder.append("tempDir", tempDir);
        builder.append("sitemapPaths", sitemapPaths);
        return builder.toString();
    }
}
