package com.norconex.collector.http.url.impl;

import com.norconex.collector.http.filter.impl.SegmentCountURLFilter;
import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.Link;
import com.norconex.commons.lang.config.IXMLConfigurable;
import com.norconex.commons.lang.config.XMLConfigurationUtil;
import com.norconex.commons.lang.file.ContentType;
import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter;
import com.norconex.importer.util.CharsetUtil;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.collections4.map.ListOrderedMap;
import org.apache.commons.configuration.HierarchicalConfiguration;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.tika.utils.CharsetUtils;

/* loaded from: input_file:com/norconex/collector/http/url/impl/RegexLinkExtractor.class */
/**
 * Link extractor that locates URLs in character content using configurable
 * regular expressions.
 *
 * <p>Each registered pattern is paired with an optional replacement string.
 * Every match of a pattern in the content becomes a candidate URL: the matched
 * text itself when no replacement is set, otherwise the result of applying
 * the replacement (which may use group references such as {@code $1}) to the
 * matched text. Candidates are HTML-unescaped, resolved against the URL of
 * the document they were found in, and dropped when they are longer than
 * {@link #getMaxURLLength()}.</p>
 *
 * <p>Content is scanned in chunks of at most {@link #MAX_BUFFER_SIZE}
 * characters. The last {@link #OVERLAP_SIZE} characters of a chunk are kept
 * and scanned again with the next chunk.</p>
 */
public class RegexLinkExtractor implements ILinkExtractor, IXMLConfigurable {
    private static final Logger LOG = LogManager.getLogger(RegexLinkExtractor.class);

    /** Content-type pattern used when none is configured. */
    public static final String DEFAULT_CONTENT_TYPE_PATTERN = "text/.*";
    /** Number of buffered characters that triggers a scan of the buffer. */
    public static final int MAX_BUFFER_SIZE = 1048576;
    /** Number of trailing characters carried over from one chunk to the next. */
    public static final int OVERLAP_SIZE = 8192;
    /** Maximum URL length used when none is configured. */
    public static final int DEFAULT_MAX_URL_LENGTH = 2048;
    /** Number of leading URL characters shown when logging a rejected URL. */
    private static final int LOGGING_MAX_URL_LENGTH = 200;

    private String charset;
    private String applyToReferencePattern;
    private int maxURLLength = DEFAULT_MAX_URL_LENGTH;
    private String applyToContentTypePattern = DEFAULT_CONTENT_TYPE_PATTERN;
    /** Regular expression to replacement (possibly null), in insertion order. */
    private final Map<String, String> patterns = new ListOrderedMap<>();

    /**
     * Pre-computed pieces of the URL of the document being parsed, used to
     * turn relative links into absolute ones. For example, for
     * {@code http://host/a/b.html?x=1} the scheme is {@code http://}, the
     * relative base is {@code http://host/a/}, the absolute base is
     * {@code http://host} and the document base is
     * {@code http://host/a/b.html}.
     */
    public static class Referer {
        // Compiled once: the same expressions are applied to every document URL.
        private static final Pattern SCHEME_AND_REST = Pattern.compile("(.*?:(//){0,1})(.*)");
        private static final Pattern QUERY_OR_FRAGMENT = Pattern.compile("(.*?)([\\?\\#])(.*)");
        private static final Pattern UP_TO_LAST_SLASH = Pattern.compile("(.*/)(.*)");
        private static final Pattern UP_TO_FIRST_SLASH = Pattern.compile("(.*?)(/.*)");

        private final String scheme;
        private final String path;
        private final String relativeBase;
        private final String absoluteBase;
        private final String documentBase;
        private final String url;

        /**
         * Splits the given document URL into the bases used for resolution.
         *
         * @param str URL of the document links are extracted from; must not
         *            be null
         */
        public Referer(String str) {
            this.url = str;
            this.scheme = SCHEME_AND_REST.matcher(str).replaceFirst("$1");
            this.path = SCHEME_AND_REST.matcher(str).replaceFirst("$3");
            // Everything after the scheme, minus any query string or fragment.
            String pathOnly = QUERY_OR_FRAGMENT.matcher(this.path).replaceFirst("$1");
            this.relativeBase = this.scheme + UP_TO_LAST_SLASH.matcher(pathOnly).replaceFirst("$1");
            this.absoluteBase = this.scheme + UP_TO_FIRST_SLASH.matcher(this.path).replaceFirst("$1");
            this.documentBase = this.scheme + pathOnly;
            if (RegexLinkExtractor.LOG.isDebugEnabled()) {
                RegexLinkExtractor.LOG.debug("DOCUMENT URL ----> " + str);
                RegexLinkExtractor.LOG.debug("  BASE RELATIVE -> " + this.relativeBase);
                RegexLinkExtractor.LOG.debug("  BASE ABSOLUTE -> " + this.absoluteBase);
            }
        }
    }

    /**
     * Extracts links from the given content.
     *
     * <p>The content is decoded with the configured charset when one is set,
     * otherwise with the charset detected from the stream, falling back to
     * UTF-8 when neither yields a value. The stream is not closed by this
     * method.</p>
     *
     * @param inputStream content to scan
     * @param str URL of the document the content belongs to
     * @param contentType content type of the document (not used here)
     * @return the distinct links found, each with its referrer set to
     *         {@code str}
     * @throws IOException if the content cannot be read
     */
    @Override // com.norconex.collector.http.url.ILinkExtractor
    public Set<Link> extractLinks(InputStream inputStream, String str, ContentType contentType) throws IOException {
        String configuredCharset = getCharset();
        String candidateCharset = StringUtils.isBlank(configuredCharset)
                ? CharsetUtil.detectCharset(inputStream)
                : CharsetUtils.clean(configuredCharset);
        String effectiveCharset = StringUtils.defaultIfBlank(candidateCharset, StandardCharsets.UTF_8.toString());

        Referer referer = new Referer(str);
        Set<Link> links = new HashSet<>();
        StringBuilder buffer = new StringBuilder();
        // Not closed on purpose: closing the reader would close the
        // caller-supplied stream.
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, effectiveCharset));
        int ch;
        while ((ch = reader.read()) != -1) {
            buffer.append((char) ch);
            if (buffer.length() >= MAX_BUFFER_SIZE) {
                extractLinks(buffer.toString(), referer, links);
                // Keep the tail so it is scanned again with the next chunk.
                buffer.delete(0, buffer.length() - OVERLAP_SIZE);
            }
        }
        extractLinks(buffer.toString(), referer, links);
        return links;
    }

    /**
     * Tells whether this extractor applies to the given document.
     *
     * <p>A blank reference pattern or content-type pattern accepts
     * everything for that criterion. When a pattern is set, the whole value
     * must match it; a null value never matches.</p>
     *
     * @param str document URL
     * @param contentType document content type
     * @return {@code true} if both criteria are satisfied
     */
    @Override // com.norconex.collector.http.url.ILinkExtractor
    public boolean accepts(String str, ContentType contentType) {
        if (StringUtils.isNotBlank(this.applyToReferencePattern)
                && (str == null || !Pattern.matches(this.applyToReferencePattern, str))) {
            return false;
        }
        if (StringUtils.isBlank(this.applyToContentTypePattern)) {
            return true;
        }
        return contentType != null
                && Pattern.matches(this.applyToContentTypePattern, contentType.toString());
    }

    /**
     * @return maximum length a resolved URL may have to be kept
     */
    public int getMaxURLLength() {
        return this.maxURLLength;
    }

    /**
     * @param i maximum length a resolved URL may have to be kept
     */
    public void setMaxURLLength(int i) {
        this.maxURLLength = i;
    }

    /**
     * @return charset used to decode content, or null/blank to detect it
     */
    public String getCharset() {
        return this.charset;
    }

    /**
     * @param str charset used to decode content, or null/blank to detect it
     */
    public void setCharset(String str) {
        this.charset = str;
    }

    /**
     * @return regular expression content types must match, possibly blank
     */
    public String getApplyToContentTypePattern() {
        return this.applyToContentTypePattern;
    }

    /**
     * @param str regular expression content types must match; blank accepts
     *            all content types
     */
    public void setApplyToContentTypePattern(String str) {
        this.applyToContentTypePattern = str;
    }

    /**
     * @return regular expression document URLs must match, possibly blank
     */
    public String getApplyToReferencePattern() {
        return this.applyToReferencePattern;
    }

    /**
     * @param str regular expression document URLs must match; blank accepts
     *            all URLs
     */
    public void setApplyToReferencePattern(String str) {
        this.applyToReferencePattern = str;
    }

    /**
     * @return a new list of the registered regular expressions, in the
     *         order they were added
     */
    public List<String> getPatterns() {
        return new ArrayList<>(this.patterns.keySet());
    }

    /**
     * @param str a registered regular expression
     * @return the replacement registered with it, or null if there is none
     */
    public String getPatternReplacement(String str) {
        return this.patterns.get(str);
    }

    /**
     * Returns the match group of a pattern whose replacement is a single
     * group reference ({@code $} followed by digits only).
     *
     * @param str a registered regular expression
     * @return the group number, or -1 when the replacement is absent, is not
     *         a single group reference, or does not fit in an {@code int}
     * @deprecated use {@link #getPatternReplacement(String)} instead
     */
    @Deprecated
    public int getPatternMatchGroup(String str) {
        String replacement = getPatternReplacement(str);
        if (replacement == null || !replacement.matches("^\\$\\d+$")) {
            return -1;
        }
        try {
            return Integer.parseInt(StringUtils.removeStart(replacement, "$"));
        } catch (NumberFormatException ignored) {
            // Only digits remain at this point, so the value overflowed int;
            // no such group can exist.
            return -1;
        }
    }

    /** Removes all registered patterns. */
    public void clearPatterns() {
        this.patterns.clear();
    }

    /**
     * Registers a pattern whose matched text is used as the URL.
     *
     * @param str regular expression
     */
    public void addPattern(String str) {
        this.patterns.put(str, null);
    }

    /**
     * Registers a pattern together with a replacement applied to each match.
     *
     * @param str regular expression
     * @param str2 replacement, or null/blank to use the matched text as is
     */
    public void addPattern(String str, String str2) {
        this.patterns.put(str, str2);
    }

    /**
     * Registers a pattern whose URL is taken from the given match group.
     *
     * @param str regular expression
     * @param i match group holding the URL
     * @deprecated use {@link #addPattern(String, String)} with a
     *             {@code $n} replacement instead
     */
    @Deprecated
    public void addPattern(String str, int i) {
        this.patterns.put(str, "$" + i);
    }

    /**
     * Applies every registered pattern to one chunk of content and adds the
     * resulting links to {@code set}.
     *
     * @param str chunk of content to scan
     * @param referer bases of the document the chunk belongs to
     * @param set receives the links found
     */
    private void extractLinks(String str, Referer referer, Set<Link> set) {
        for (Map.Entry<String, String> entry : this.patterns.entrySet()) {
            Pattern pattern = Pattern.compile(entry.getKey());
            String replacement = entry.getValue();
            boolean hasReplacement = StringUtils.isNotBlank(replacement);
            Matcher matcher = pattern.matcher(str);
            while (matcher.find()) {
                String candidate = matcher.group();
                if (hasReplacement) {
                    // The replacement is applied to the matched text alone,
                    // reusing the already compiled pattern.
                    candidate = pattern.matcher(candidate).replaceFirst(replacement);
                }
                String absoluteUrl = toCleanAbsoluteURL(referer, candidate);
                if (absoluteUrl == null) {
                    continue;
                }
                Link link = new Link(absoluteUrl);
                link.setReferrer(referer.url);
                set.add(link);
            }
        }
    }

    /**
     * Trims, HTML-unescapes and resolves a candidate URL against the
     * referring document.
     *
     * @param referer bases of the referring document
     * @param str candidate URL as extracted
     * @return the absolute URL, or null when the candidate is blank or the
     *         result is longer than the configured maximum
     */
    private String toCleanAbsoluteURL(Referer referer, String str) {
        String trimmed = StringUtils.trimToNull(str);
        if (trimmed == null) {
            return null;
        }
        // NOTE(review): the separator constant is borrowed from
        // SegmentCountURLFilter; presumably it is "/" -- confirm.
        String separator = SegmentCountURLFilter.DEFAULT_SEGMENT_SEPARATOR_PATTERN;
        String url = StringEscapeUtils.unescapeHtml4(trimmed);
        if (url.startsWith("//")) {
            // Scheme-relative: reuse the referer's scheme.
            url = referer.scheme + StringUtils.substringAfter(url, "//");
        } else if (url.startsWith(separator)) {
            url = referer.absoluteBase + url;
        } else if (url.startsWith("?") || url.startsWith("#")) {
            // Query or fragment only: same document.
            url = referer.documentBase + url;
        } else if (!url.contains(":")) {
            // No scheme: relative to the referer's directory.
            if (referer.relativeBase.endsWith(separator)) {
                url = referer.relativeBase + url;
            } else {
                url = referer.relativeBase + separator + url;
            }
        }
        if (url.length() > this.maxURLLength) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("URL length (" + url.length() + ") exceeding maximum length allowed ("
                        + this.maxURLLength + ") to be extracted. URL (showing first "
                        + LOGGING_MAX_URL_LENGTH + " chars): "
                        + StringUtils.substring(url, 0, LOGGING_MAX_URL_LENGTH) + "...");
            }
            return null;
        }
        return url;
    }

    /**
     * Loads this extractor's settings from XML. Values absent from the XML
     * keep their current setting. Registered patterns are replaced only when
     * the XML holds at least one {@code linkExtractionPatterns/pattern}
     * element. The legacy forms (expression as the text of {@code <pattern>},
     * {@code group} attribute) are still read and logged as warnings.
     *
     * @param reader XML source
     */
    public void loadFromXML(Reader reader) {
        XMLConfiguration xml = XMLConfigurationUtil.newXMLConfiguration(reader);
        setMaxURLLength(xml.getInt("[@maxURLLength]", getMaxURLLength()));
        setCharset(xml.getString("[@charset]", getCharset()));
        setApplyToContentTypePattern(xml.getString("applyToContentTypePattern", getApplyToContentTypePattern()));
        setApplyToReferencePattern(xml.getString("applyToReferencePattern", getApplyToReferencePattern()));

        List<HierarchicalConfiguration> patternNodes = xml.configurationsAt("linkExtractionPatterns.pattern");
        if (patternNodes.isEmpty()) {
            return;
        }
        clearPatterns();
        for (HierarchicalConfiguration node : patternNodes) {
            String legacyMatch = node.getString("", (String) null);
            if (StringUtils.isNotBlank(legacyMatch)) {
                LOG.warn("The regular expression is now expected to be in a <match> tag.");
            }
            String match = node.getString("match", legacyMatch);

            String legacyReplace = null;
            Integer group = node.getInteger("[@group]", (Integer) null);
            if (group != null) {
                LOG.warn("The \"group\" attribute is deprecated. Use <replace> instead.");
                legacyReplace = "$" + group;
            }
            String replace = node.getString("replace", legacyReplace);

            if (StringUtils.isNotBlank(match)) {
                addPattern(match, replace);
            }
        }
    }

    /**
     * Writes this extractor's settings as an {@code <extractor>} XML element.
     *
     * @param writer XML destination
     * @throws IOException if the XML cannot be written
     */
    public void saveToXML(Writer writer) throws IOException {
        try {
            EnhancedXMLStreamWriter xml = new EnhancedXMLStreamWriter(writer);
            xml.writeStartElement("extractor");
            xml.writeAttribute("class", getClass().getCanonicalName());
            xml.writeAttributeInteger("maxURLLength", Integer.valueOf(getMaxURLLength()));
            xml.writeAttributeString("charset", getCharset());
            xml.writeElementString("applyToContentTypePattern", getApplyToContentTypePattern());
            xml.writeElementString("applyToReferencePattern", getApplyToReferencePattern());
            xml.writeStartElement("linkExtractionPatterns");
            for (Map.Entry<String, String> entry : this.patterns.entrySet()) {
                xml.writeStartElement("pattern");
                xml.writeElementString("match", entry.getKey());
                xml.writeElementString("replace", entry.getValue());
                xml.writeEndElement();
            }
            xml.writeEndElement();
            xml.writeEndElement();
            xml.flush();
            xml.close();
        } catch (XMLStreamException e) {
            throw new IOException("Cannot save as XML.", e);
        }
    }

    @Override
    public String toString() {
        return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
                .append("maxURLLength", this.maxURLLength)
                .append("charset", this.charset)
                .append("applyToContentTypePattern", this.applyToContentTypePattern)
                .append("applyToReferencePattern", this.applyToReferencePattern)
                .append("linkExtractionPatterns", this.patterns)
                .toString();
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof RegexLinkExtractor)) {
            return false;
        }
        RegexLinkExtractor other = (RegexLinkExtractor) obj;
        return new EqualsBuilder()
                .append(this.maxURLLength, other.maxURLLength)
                .append(this.charset, other.charset)
                .append(this.applyToContentTypePattern, other.applyToContentTypePattern)
                .append(this.applyToReferencePattern, other.applyToReferencePattern)
                .append(this.patterns, other.patterns)
                .isEquals();
    }

    @Override
    public int hashCode() {
        return new HashCodeBuilder()
                .append(this.maxURLLength)
                .append(this.charset)
                .append(this.applyToContentTypePattern)
                .append(this.applyToReferencePattern)
                .append(this.patterns)
                .toHashCode();
    }
}
