package com.norconex.collector.http.url.impl;

import com.norconex.collector.http.filter.impl.SegmentCountURLFilter;
import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.Link;
import com.norconex.commons.lang.config.IXMLConfigurable;
import com.norconex.commons.lang.config.XMLConfigurationUtil;
import com.norconex.commons.lang.file.ContentType;
import com.norconex.commons.lang.map.Properties;
import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter;
import com.norconex.importer.util.CharsetUtil;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.configuration.HierarchicalConfiguration;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.tika.utils.CharsetUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/* loaded from: input_file:com/norconex/collector/http/url/impl/GenericLinkExtractor.class */
public class GenericLinkExtractor implements ILinkExtractor, IXMLConfigurable {
    /** Maximum number of characters buffered before the content is scanned for links. */
    public static final int MAX_BUFFER_SIZE = 1048576;
    /** Number of trailing characters carried over from one scanned chunk to the next. */
    public static final int OVERLAP_SIZE = 8192;
    /** Default maximum length of an extracted URL. */
    public static final int DEFAULT_MAX_URL_LENGTH = 2048;
    /** Number of URL characters shown when logging a URL rejected for its length. */
    private static final int LOGGING_MAX_URL_LENGTH = 200;
    /** Whether {@code rel="nofollow"} on anchor tags is ignored. */
    private boolean ignoreNofollow;
    /** Tag-matching pattern, rebuilt from {@link #tagAttribs} each time the link tags change. */
    private Pattern tagPattern;
    /** Character encoding forced when reading content; when blank the encoding is detected. */
    private String charset;
    /** When {@code false}, HTML comments are stripped before links are searched for. */
    private boolean commentsEnabled;
    private static final Logger LOG = LogManager.getLogger(GenericLinkExtractor.class);
    // NOTE(review): the last two entries have no "application/" style prefix -- confirm they are intended.
    private static final ContentType[] DEFAULT_CONTENT_TYPES = {ContentType.HTML, ContentType.valueOf("application/xhtml+xml"), ContentType.valueOf("vnd.wap.xhtml+xml"), ContentType.valueOf("x-asp")};
    /** URL schemes accepted when none are configured. */
    private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"};
    /** Flags shared by the HTML patterns: case-insensitive, and '.' also matches line terminators. */
    private static final int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
    private static final Pattern BASE_HREF_PATTERN = Pattern.compile("<base[^<]+?href\\s*=\\s*([\"']{0,1})(.*?)\\1", PATTERN_FLAGS);
    private static final Pattern A_TEXT_PATTERN = Pattern.compile("<a[^<]+?>(.*?)<\\s*/\\s*a\\s*>", PATTERN_FLAGS);
    private static final Pattern A_TITLE_PATTERN = Pattern.compile("\\s*title\\s*=\\s*([\"'])(.*?)\\1", PATTERN_FLAGS);
    private static final Pattern SCRIPT_PATTERN = Pattern.compile("(<\\s*script\\b.*?>)(.*?)(<\\s*/\\s*script\\s*>)", PATTERN_FLAGS);
    private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--.*?-->", PATTERN_FLAGS);
    private static final Pattern META_EQUIV_REFRESH_PATTERN = Pattern.compile("(^|\\W+)http-equiv\\s*=\\s*[\"']{0,1}refresh[\"']{0,1}", PATTERN_FLAGS);
    private static final Pattern META_CONTENT_URL_PATTERN = Pattern.compile("(^|\\W+)content\\s*=\\s*([\"'])[^a-zA-Z]*url\\s*=\\s*([\"']{0,1})([^\\<\\>\"']+?)[\\<\\>\"'].*?", PATTERN_FLAGS);
    private static final Pattern NOFOLLOW_PATTERN = Pattern.compile("(^|\\s)rel\\s*=\\s*([\"']{0,1})(\\s*nofollow\\s*)\\2", PATTERN_FLAGS);
    /** Matches strings that start with a URI scheme followed by a colon. */
    private static final Pattern SCHEME_PATTERN = Pattern.compile("^[a-z][a-z0-9\\+\\.\\-]*:.*$", Pattern.CASE_INSENSITIVE);
    /** Content types this extractor handles. */
    private ContentType[] contentTypes = DEFAULT_CONTENT_TYPES;
    /** Schemes a URL carrying an explicit scheme must use to be extracted. */
    private String[] schemes = DEFAULT_SCHEMES;
    /** URLs longer than this are not extracted. */
    private int maxURLLength = DEFAULT_MAX_URL_LENGTH;
    /** Tag name to attribute name(s) holding URLs; created with the {@code true} constructor flag. */
    private final Properties tagAttribs = new Properties(true);
    private final List<String> extractSelectors = new ArrayList<>();
    private final List<String> noExtractSelectors = new ArrayList<>();
    private final List<RegexPair> extractBetweens = new ArrayList<>();
    private final List<RegexPair> noExtractBetweens = new ArrayList<>();
    /** Document URLs fully matching any of these patterns get no links extracted. */
    private final List<Pattern> nofollowPatterns = new ArrayList<>();

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/norconex/collector/http/url/impl/GenericLinkExtractor$Referer.class */
    /**
     * Pre-computed pieces of a document URL, used to turn relative links
     * found in that document into absolute ones.
     * <p>
     * For {@code http://host/a/b.html?x=1} the parts are: scheme
     * {@code http://}, path {@code host/a/b.html?x=1}, relative base
     * {@code http://host/a/}, absolute base {@code http://host} and
     * document base {@code http://host/a/b.html}.
     */
    public static class Referer {
        private final String scheme;
        private final String path;
        private final String relativeBase;
        private final String absoluteBase;
        private final String documentBase;
        private final String url;

        /**
         * Splits the given URL into its reusable parts.
         *
         * @param str the document URL; must not be {@code null}
         */
        public Referer(String str) {
            // Group 1: everything up to the first ':' plus an optional "//"; group 3: the rest.
            final String schemeSplit = "(.*?:(//){0,1})(.*)";
            // Group 1: everything before the first '?' or '#'.
            final String queryOrFragmentSplit = "(.*?)([\\?\\#])(.*)";

            this.url = str;
            this.scheme = str.replaceFirst(schemeSplit, "$1");
            this.path = str.replaceFirst(schemeSplit, "$3");

            String pathWithoutQuery = this.path.replaceFirst(queryOrFragmentSplit, "$1");
            // Keep everything up to and including the last '/'.
            this.relativeBase = this.scheme + pathWithoutQuery.replaceFirst("(.*/)(.*)", "$1");
            // Keep only what precedes the first '/' of the path.
            this.absoluteBase = this.scheme + this.path.replaceFirst("(.*?)(/.*)", "$1");
            this.documentBase = this.scheme + pathWithoutQuery;

            if (GenericLinkExtractor.LOG.isDebugEnabled()) {
                GenericLinkExtractor.LOG.debug("DOCUMENT URL ----> " + str);
                GenericLinkExtractor.LOG.debug("  BASE RELATIVE -> " + this.relativeBase);
                GenericLinkExtractor.LOG.debug("  BASE ABSOLUTE -> " + this.absoluteBase);
            }
        }
    }

    /* loaded from: input_file:com/norconex/collector/http/url/impl/GenericLinkExtractor$RegexPair.class */
    /**
     * Immutable pair of regular expressions marking the start and the end
     * of a region of content, plus whether matching is case sensitive.
     */
    public static class RegexPair {
        private final String start;
        private final String end;
        private final boolean caseSensitive;

        /**
         * Creates a new pair.
         *
         * @param str regular expression matching the start of a region
         * @param str2 regular expression matching the end of a region
         * @param z {@code true} to match case-sensitively
         */
        public RegexPair(String str, String str2, boolean z) {
            this.start = str;
            this.end = str2;
            this.caseSensitive = z;
        }

        /** @return the regular expression matching the start of a region */
        public String getStart() {
            return this.start;
        }

        /** @return the regular expression matching the end of a region */
        public String getEnd() {
            return this.end;
        }

        /** @return {@code true} if matching is case sensitive */
        public boolean isCaseSensitive() {
            return this.caseSensitive;
        }

        // Reflection-based comparison; the trailing "false" is the testTransients argument.
        public boolean equals(Object obj) {
            return EqualsBuilder.reflectionEquals(this, obj, false);
        }

        // Reflection-based hash; the trailing "false" is the testTransients argument.
        public int hashCode() {
            return HashCodeBuilder.reflectionHashCode(this, false);
        }

        public String toString() {
            return ReflectionToStringBuilder.toString(this, ToStringStyle.SHORT_PREFIX_STYLE);
        }
    }

    public GenericLinkExtractor() {
        addLinkTag("a", "href");
        addLinkTag("frame", "src");
        addLinkTag("iframe", "src");
        addLinkTag("img", "src");
        addLinkTag("meta", "http-equiv");
    }

    /**
     * Reads the document from the stream and extracts its links.
     * <p>
     * Content is read one character at a time into a buffer. Each time the
     * buffer reaches {@link #MAX_BUFFER_SIZE} characters it is scanned for
     * links and trimmed to its last {@link #OVERLAP_SIZE} characters before
     * reading resumes. A {@code <base href>} is only looked for up to and
     * including the first scan.
     *
     * @param inputStream the document content; not closed by this method
     * @param str the document URL
     * @param contentType the document content type
     * @return the extracted links, or {@code null} when the content type is
     *         not handled or the URL matches one of the nofollow patterns
     * @throws IOException if the content cannot be read
     */
    @Override // com.norconex.collector.http.url.ILinkExtractor
    public Set<Link> extractLinks(InputStream inputStream, String str, ContentType contentType) throws IOException {
        ContentType[] handledTypes =
                ArrayUtils.isEmpty(this.contentTypes) ? DEFAULT_CONTENT_TYPES : this.contentTypes;
        if (!ArrayUtils.contains(handledTypes, contentType)) {
            return null;
        }
        for (Pattern nofollowPattern : this.nofollowPatterns) {
            if (nofollowPattern.matcher(str).matches()) {
                return null;
            }
        }

        // Explicit charset wins; otherwise detect it; fall back to UTF-8.
        String configuredCharset = getCharset();
        String candidateCharset;
        if (StringUtils.isBlank(configuredCharset)) {
            candidateCharset = CharsetUtil.detectCharset(inputStream);
        } else {
            candidateCharset = CharsetUtils.clean(configuredCharset);
        }
        String effectiveCharset =
                StringUtils.defaultIfBlank(candidateCharset, StandardCharsets.UTF_8.toString());

        Referer currentReferer = new Referer(str);
        Set<Link> links = new HashSet<>();
        StringBuilder buffer = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, effectiveCharset));
        boolean firstChunk = true;

        int ch;
        while ((ch = reader.read()) != -1) {
            buffer.append((char) ch);
            if (buffer.length() >= MAX_BUFFER_SIZE) {
                String chunk = buffer.toString();
                currentReferer = adjustReferer(chunk, currentReferer, firstChunk);
                firstChunk = false;
                extractLinks(chunk, currentReferer, links);
                // Keep the tail so a tag straddling two chunks is still seen whole.
                buffer.delete(0, buffer.length() - OVERLAP_SIZE);
            }
        }

        // Whatever is left once the stream is exhausted.
        String remainder = buffer.toString();
        extractLinks(remainder, adjustReferer(remainder, currentReferer, firstChunk), links);
        buffer.setLength(0);
        return links;
    }

    /**
     * Returns the referer to resolve relative links against, honoring a
     * {@code <base href="...">} declaration when one is found.
     *
     * @param str the content in which to look for a base tag
     * @param referer the current referer
     * @param z whether the base tag should be looked for at all
     * @return a referer built from the base href, or {@code referer} when
     *         no usable base href is found
     */
    private Referer adjustReferer(String str, Referer referer, boolean z) {
        if (!z) {
            return referer;
        }
        Matcher baseMatcher = BASE_HREF_PATTERN.matcher(str);
        if (!baseMatcher.find()) {
            return referer;
        }
        String baseHref = baseMatcher.group(2);
        if (StringUtils.isBlank(baseHref)) {
            return referer;
        }
        String baseUrl = toCleanAbsoluteURL(referer, baseHref);
        // toCleanAbsoluteURL returns null for rejected URLs (disallowed scheme,
        // over-long URL); the Referer constructor would fail on null, so keep
        // the current referer in that case.
        if (baseUrl == null) {
            return referer;
        }
        return new Referer(baseUrl);
    }

    /**
     * Tells whether this extractor handles the given content type.
     *
     * @param str the document URL (not used by this implementation)
     * @param contentType the document content type
     * @return {@code true} when no content types are configured, or when
     *         the given one is among those configured
     */
    @Override // com.norconex.collector.http.url.ILinkExtractor
    public boolean accepts(String str, ContentType contentType) {
        return ArrayUtils.isEmpty(contentTypes) || ArrayUtils.contains(contentTypes, contentType);
    }

    /** @return the maximum length a URL may have to be extracted */
    public int getMaxURLLength() {
        return maxURLLength;
    }

    /** @param i the maximum length a URL may have to be extracted */
    public void setMaxURLLength(int i) {
        maxURLLength = i;
    }

    /** @return a copy of the content types handled by this extractor */
    public ContentType[] getContentTypes() {
        ContentType[] copy = ArrayUtils.clone(contentTypes);
        return copy;
    }

    /** @param contentTypeArr the content types to handle; the array is copied */
    public void setContentTypes(ContentType... contentTypeArr) {
        ContentType[] copy = ArrayUtils.clone(contentTypeArr);
        contentTypes = copy;
    }

    /** @return a new array holding the "extract between" pairs */
    public RegexPair[] getExtractBetweens() {
        RegexPair[] pairs = new RegexPair[0];
        return extractBetweens.toArray(pairs);
    }

    /** @param regexPairArr the pairs replacing any existing "extract between" pairs */
    public void setExtractBetweens(RegexPair... regexPairArr) {
        List<RegexPair> replacement = Arrays.asList(regexPairArr);
        extractBetweens.clear();
        extractBetweens.addAll(replacement);
    }

    /**
     * Adds an "extract between" pair.
     *
     * @param str regular expression matching the start of a region
     * @param str2 regular expression matching the end of a region
     * @param z {@code true} to match case-sensitively
     */
    public void addExtractBetween(String str, String str2, boolean z) {
        RegexPair pair = new RegexPair(str, str2, z);
        extractBetweens.add(pair);
    }

    /** @return a new array holding the "no extract between" pairs */
    public RegexPair[] getNoExtractBetweens() {
        RegexPair[] pairs = new RegexPair[0];
        return noExtractBetweens.toArray(pairs);
    }

    /** @param regexPairArr the pairs replacing any existing "no extract between" pairs */
    public void setNoExtractBetweens(RegexPair... regexPairArr) {
        List<RegexPair> replacement = Arrays.asList(regexPairArr);
        noExtractBetweens.clear();
        noExtractBetweens.addAll(replacement);
    }

    /**
     * Adds a "no extract between" pair.
     *
     * @param str regular expression matching the start of a region
     * @param str2 regular expression matching the end of a region
     * @param z {@code true} to match case-sensitively
     */
    public void addNoExtractBetween(String str, String str2, boolean z) {
        RegexPair pair = new RegexPair(str, str2, z);
        noExtractBetweens.add(pair);
    }

    /** @return the selectors identifying the content links are extracted from */
    public String[] getExtractSelectors() {
        String[] selectors = extractSelectors.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
        return selectors;
    }

    /** @param strArr the selectors replacing any existing extract selectors */
    public void setExtractSelectors(String... strArr) {
        List<String> replacement = Arrays.asList(strArr);
        extractSelectors.clear();
        extractSelectors.addAll(replacement);
    }

    /** @param strArr the selectors to add to the existing extract selectors */
    public void addExtractSelectors(String... strArr) {
        List<String> additions = Arrays.asList(strArr);
        extractSelectors.addAll(additions);
    }

    /** @return the selectors identifying the content removed before extraction */
    public String[] getNoExtractSelectors() {
        String[] selectors = noExtractSelectors.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
        return selectors;
    }

    /** @param strArr the selectors replacing any existing "no extract" selectors */
    public void setNoExtractSelectors(String... strArr) {
        List<String> replacement = Arrays.asList(strArr);
        noExtractSelectors.clear();
        noExtractSelectors.addAll(replacement);
    }

    /** @param strArr the selectors to add to the existing "no extract" selectors */
    public void addNoExtractSelectors(String... strArr) {
        List<String> additions = Arrays.asList(strArr);
        noExtractSelectors.addAll(additions);
    }

    /** @return a new list with the source text of each nofollow pattern */
    public List<String> getNofollowPatterns() {
        List<String> regexes = new ArrayList<>(nofollowPatterns.size());
        for (Pattern compiled : nofollowPatterns) {
            regexes.add(compiled.pattern());
        }
        return regexes;
    }

    /**
     * Replaces the nofollow patterns with the given regular expressions.
     *
     * @param list regular expressions, each compiled with default flags
     */
    public void setNofollowPatterns(List<String> list) {
        nofollowPatterns.clear();
        for (String regex : list) {
            nofollowPatterns.add(Pattern.compile(regex));
        }
    }

    /** @param str a regular expression to add to the nofollow patterns */
    public void addNofollowPatterns(String str) {
        Pattern compiled = Pattern.compile(str);
        nofollowPatterns.add(compiled);
    }

    /** @return {@code true} if HTML comments are kept when looking for links */
    public boolean isCommentsEnabled() {
        return commentsEnabled;
    }

    /** @param z {@code true} to keep HTML comments when looking for links */
    public void setCommentsEnabled(boolean z) {
        commentsEnabled = z;
    }

    /**
     * Gets the schemes a URL carrying an explicit scheme must use to be
     * extracted.
     *
     * @return a copy of the configured schemes (may be {@code null}), so
     *         callers cannot alter this extractor's configuration through
     *         the returned array, as is already the case for
     *         {@link #getContentTypes()}
     */
    public String[] getSchemes() {
        return ArrayUtils.clone(this.schemes);
    }

    /**
     * Sets the schemes a URL carrying an explicit scheme must use to be
     * extracted.
     *
     * @param strArr the schemes; the array is copied so later changes made
     *        by the caller do not leak into this extractor, as is already
     *        the case for {@link #setContentTypes(ContentType...)}
     */
    public void setSchemes(String... strArr) {
        this.schemes = ArrayUtils.clone(strArr);
    }

    /** @return {@code true} if {@code rel="nofollow"} on anchors is ignored */
    public boolean isIgnoreNofollow() {
        return ignoreNofollow;
    }

    /** @param z {@code true} to ignore {@code rel="nofollow"} on anchors */
    public void setIgnoreNofollow(boolean z) {
        ignoreNofollow = z;
    }

    /**
     * Always reports that referrer data is kept.
     *
     * @return always {@code true}
     * @deprecated this setting no longer has any effect
     */
    @Deprecated
    public boolean isKeepReferrerData() {
        return true;
    }

    /**
     * Does nothing except log a warning that the setting has no effect.
     *
     * @param z ignored
     * @deprecated this setting no longer has any effect
     */
    @Deprecated
    public void setKeepReferrerData(boolean z) {
        LOG.warn("Since 2.6.0, referrer data is always kept. Setting \"keepReferrerData\" has no effect.");
    }

    /** @return the character encoding forced on content, or {@code null}/blank to detect it */
    public String getCharset() {
        return charset;
    }

    /** @param str the character encoding to force on content, or {@code null}/blank to detect it */
    public void setCharset(String str) {
        charset = str;
    }

    /**
     * Registers a tag/attribute combination to extract URLs from and
     * rebuilds the tag pattern.
     *
     * @param str the tag name
     * @param str2 the attribute name
     */
    public synchronized void addLinkTag(String str, String str2) {
        String[] attribute = {str2};
        tagAttribs.addString(str, attribute);
        resetTagPattern();
    }

    /**
     * Unregisters a tag/attribute combination and rebuilds the tag pattern.
     *
     * @param str the tag name
     * @param str2 the attribute to remove, or {@code null} to remove the
     *        tag with all its attributes
     */
    public synchronized void removeLinkTag(String str, String str2) {
        boolean dropWholeTag = (str2 == null);
        if (!dropWholeTag) {
            List<String> remaining = tagAttribs.getStrings(str);
            remaining.remove(str2);
            if (remaining.isEmpty()) {
                // No attribute left for this tag: drop the tag itself.
                dropWholeTag = true;
            } else {
                tagAttribs.setString(str, remaining.toArray(ArrayUtils.EMPTY_STRING_ARRAY));
            }
        }
        if (dropWholeTag) {
            tagAttribs.remove(str);
        }
        resetTagPattern();
    }

    /** Removes every registered tag/attribute and rebuilds the tag pattern. */
    public synchronized void clearLinkTags() {
        tagAttribs.clear();
        resetTagPattern();
    }

    /**
     * Rebuilds {@code tagPattern} so it matches the opening tag of any
     * registered tag name. Tag names are inserted in the expression as is.
     */
    private void resetTagPattern() {
        String tagNames = StringUtils.join(tagAttribs.keySet(), '|');
        String regex = "<(" + tagNames + ")((\\s*>)|(\\s([^\\<]*?)>))";
        tagPattern = Pattern.compile(regex, PATTERN_FLAGS);
    }

    /**
     * Builds a pattern capturing (group 1) the text between the opening
     * and closing tags of the given name.
     *
     * @param str the tag name, inserted in the expression as is
     * @return the compiled pattern
     */
    private Pattern getTagBodyPattern(String str) {
        String openingTag = "<\\s*" + str + "[^<]*?>";
        String closingTag = "<\\s*/\\s*" + str + "\\s*>";
        return Pattern.compile(openingTag + "([^<]*?)" + closingTag, PATTERN_FLAGS);
    }

    /**
     * Extracts links from a chunk of content and adds them to the given set.
     * <p>
     * Unwanted content is excluded first, script bodies are emptied and,
     * unless comments are enabled, HTML comments are removed. Then, for each
     * registered tag found:
     * <ul>
     *   <li>a tag registered without attribute has its body taken as the
     *       URL;</li>
     *   <li>a {@code meta} tag is handled as a potential refresh
     *       redirect;</li>
     *   <li>any other tag has the value of each registered attribute taken
     *       as a URL ({@code object} values are split on spaces,
     *       {@code applet} values on commas and spaces).</li>
     * </ul>
     * Anchors flagged {@code rel="nofollow"} are skipped unless
     * {@code ignoreNofollow} is set. Anchor text and title, when found, are
     * stored on the links created for that anchor.
     *
     * @param str the content to scan
     * @param referer the referer relative URLs are resolved against
     * @param set the set receiving the extracted links
     */
    private void extractLinks(String str, Referer referer, Set<Link> set) {
        // Keep the script tags themselves but drop what they contain.
        String content = SCRIPT_PATTERN.matcher(excludeUnwantedContent(str)).replaceAll("$1$3");
        if (!isCommentsEnabled()) {
            content = COMMENT_PATTERN.matcher(content).replaceAll("");
        }

        Matcher tagMatcher = this.tagPattern.matcher(content);
        while (tagMatcher.find()) {
            String tagName = tagMatcher.group(1);
            String restOfTag = tagMatcher.group(4);
            String attribs = this.tagAttribs.getString(tagName);

            //--- No attribute registered: the tag body is the URL ---
            if (StringUtils.isBlank(attribs)) {
                Matcher bodyMatcher = getTagBodyPattern(tagName).matcher(content);
                if (bodyMatcher.find(tagMatcher.start())) {
                    String bodyUrl = toCleanAbsoluteURL(referer, bodyMatcher.group(1).trim());
                    if (bodyUrl != null) {
                        Link bodyLink = new Link(bodyUrl);
                        bodyLink.setReferrer(referer.url);
                        bodyLink.setTag(tagName);
                        set.add(bodyLink);
                    }
                }
                continue;
            }

            // A tag without any attribute cannot hold a URL.
            if (StringUtils.isBlank(restOfTag)) {
                continue;
            }

            if ("meta".equalsIgnoreCase(tagName)) {
                extractMetaRefresh(restOfTag, referer, set);
                continue;
            }

            String text = null;
            String title = null;
            if ("a".equalsIgnoreCase(tagName)) {
                // A nofollow anchor must not yield any link at all, not
                // merely lose its text and title.
                if (!this.ignoreNofollow && isNofollow(restOfTag)) {
                    continue;
                }
                Matcher textMatcher = A_TEXT_PATTERN.matcher(content);
                if (textMatcher.find(tagMatcher.start())) {
                    // Strip any markup nested in the anchor text.
                    text = textMatcher.group(1).trim().replaceAll("<[^>]*>", "");
                }
                Matcher titleMatcher = A_TITLE_PATTERN.matcher(restOfTag);
                if (titleMatcher.find()) {
                    title = titleMatcher.group(2).trim();
                }
            }

            // Attribute value either quoted (url1) or bare (url2).
            Pattern attribPattern = Pattern.compile("(^|\\s)(" + attribs + ")\\s*=\\s*((?<quot>[\"'])(?<url1>[^\\<\\>]*?)\\k<quot>|(?<url2>[^\\s\\>]+)[\\s\\>])", PATTERN_FLAGS);
            Matcher urlMatcher = attribPattern.matcher(restOfTag);
            while (urlMatcher.find()) {
                String attribName = urlMatcher.group(2);
                String quotedValue = urlMatcher.group("url1");
                String rawValue = quotedValue != null ? quotedValue : urlMatcher.group("url2");
                if (StringUtils.isBlank(rawValue)) {
                    continue;
                }

                String[] candidates;
                if ("object".equalsIgnoreCase(tagName)) {
                    candidates = StringUtils.split(rawValue, ' ');
                } else if ("applet".equalsIgnoreCase(tagName)) {
                    candidates = StringUtils.split(rawValue, ", ");
                } else {
                    candidates = new String[] {rawValue};
                }

                for (String candidate : candidates) {
                    String url = toCleanAbsoluteURL(referer, candidate);
                    if (url == null) {
                        continue;
                    }
                    Link link = new Link(url);
                    link.setReferrer(referer.url);
                    link.setTag(tagName + "." + attribName);
                    if (StringUtils.isNotBlank(text)) {
                        link.setText(text);
                    }
                    if (StringUtils.isNotBlank(title)) {
                        link.setTitle(title);
                    }
                    set.add(link);
                }
            }
        }
    }

    /**
     * Narrows content down to what links may be extracted from, applying in
     * order: extract-betweens, no-extract-betweens, extract selectors and
     * no-extract selectors. Each step is skipped when nothing is configured
     * for it.
     *
     * @param str the original content
     * @return the content left after all configured steps
     */
    private String excludeUnwantedContent(String str) {
        String remaining = str;
        remaining = extractBetweens.isEmpty() ? remaining : applyExtractBetweens(remaining);
        remaining = noExtractBetweens.isEmpty() ? remaining : applyNoExtractBetweens(remaining);
        remaining = extractSelectors.isEmpty() ? remaining : applyExtractSelectors(remaining);
        remaining = noExtractSelectors.isEmpty() ? remaining : applyNoExtractSelectors(remaining);
        return remaining;
    }

    /**
     * Keeps only the regions of content delimited by the configured
     * "extract between" pairs, concatenated pair after pair.
     *
     * @param str the content
     * @return the concatenation of all matched regions
     */
    private String applyExtractBetweens(String str) {
        StringBuilder kept = new StringBuilder();
        for (RegexPair delimiters : extractBetweens) {
            for (Pair<Integer, Integer> range : matchBetweens(str, delimiters)) {
                int from = range.getLeft();
                int to = range.getRight();
                kept.append(str, from, to);
            }
        }
        return kept.toString();
    }

    /**
     * Removes from content every region delimited by the configured
     * "no extract between" pairs.
     * <p>
     * All regions are located in the original content and the union of them
     * is removed in one pass. Deleting them one after the other from a
     * shared buffer would not be safe: offsets found in the original content
     * stop being valid once an earlier pair deleted text, and regions of a
     * same pair may overlap (several start matches followed by the same end
     * match).
     *
     * @param str the content
     * @return the content without the matched regions
     */
    private String applyNoExtractBetweens(String str) {
        // One flag per character of the original content: true == removed.
        boolean[] removed = new boolean[str.length()];
        for (RegexPair delimiters : this.noExtractBetweens) {
            for (Pair<Integer, Integer> range : matchBetweens(str, delimiters)) {
                Arrays.fill(removed, range.getLeft(), range.getRight(), true);
            }
        }

        StringBuilder kept = new StringBuilder(str.length());
        for (int i = 0; i < removed.length; i++) {
            if (!removed[i]) {
                kept.append(str.charAt(i));
            }
        }
        return kept.toString();
    }

    /**
     * Locates the regions of content delimited by the given pair.
     * <p>
     * For each match of the start expression, the region runs from the
     * beginning of that match to the end of the first match of the end
     * expression found after it. Searching stops at the first start match
     * that has no end match after it.
     *
     * @param str the content to search
     * @param regexPair the start/end expressions and their case sensitivity
     * @return the regions as (start offset, end offset) pairs, end exclusive,
     *         in the order their start was found
     */
    private List<Pair<Integer, Integer>> matchBetweens(String str, RegexPair regexPair) {
        int flags = Pattern.DOTALL;
        if (!regexPair.isCaseSensitive()) {
            flags |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
        }

        List<Pair<Integer, Integer>> ranges = new ArrayList<>();
        Matcher startMatcher = Pattern.compile(regexPair.getStart(), flags).matcher(str);
        // Compiled once, on the first start match, instead of once per start
        // match. Matcher.find(int) resets the matcher, so it can be reused.
        Matcher endMatcher = null;
        while (startMatcher.find()) {
            if (endMatcher == null) {
                endMatcher = Pattern.compile(regexPair.getEnd(), flags).matcher(str);
            }
            if (!endMatcher.find(startMatcher.end())) {
                break;
            }
            ranges.add(new ImmutablePair<>(startMatcher.start(), endMatcher.end()));
        }
        return ranges;
    }

    /**
     * Keeps only the inner HTML of the elements matched by the configured
     * extract selectors, separated by a single space.
     *
     * @param str the content, parsed as HTML
     * @return the inner HTML of all matched elements
     */
    private String applyExtractSelectors(String str) {
        Document doc = Jsoup.parse(str);
        StringBuilder kept = new StringBuilder();
        for (String selector : extractSelectors) {
            for (Element matched : doc.select(selector)) {
                if (kept.length() > 0) {
                    kept.append(" ");
                }
                kept.append(matched.html());
            }
        }
        return kept.toString();
    }

    /**
     * Removes the elements matched by the configured "no extract" selectors.
     *
     * @param str the content, parsed as HTML
     * @return the parsed document, without the matched elements, as a string
     */
    private String applyNoExtractSelectors(String str) {
        Document doc = Jsoup.parse(str);
        for (String selector : noExtractSelectors) {
            for (Element matched : doc.select(selector)) {
                matched.remove();
            }
        }
        return doc.toString();
    }

    /**
     * Extracts the target URL of a {@code <meta http-equiv="refresh">} tag.
     *
     * @param str the attributes part of a meta tag
     * @param referer the referer relative URLs are resolved against
     * @param set the set receiving the extracted link, if any
     */
    private void extractMetaRefresh(String str, Referer referer, Set<Link> set) {
        if (!META_EQUIV_REFRESH_PATTERN.matcher(str).find()) {
            return;
        }
        Matcher urlMatcher = META_CONTENT_URL_PATTERN.matcher(str);
        if (!urlMatcher.find()) {
            return;
        }
        String targetUrl = toCleanAbsoluteURL(referer, urlMatcher.group(4));
        // A rejected URL (disallowed scheme, over-long URL) comes back as null
        // and must not produce a link, as for every other extracted URL.
        if (targetUrl == null) {
            return;
        }
        Link link = new Link(targetUrl);
        link.setReferrer(referer.url);
        link.setTag("meta.http-equiv.refresh");
        set.add(link);
    }

    /**
     * Tells whether tag attributes hold a {@code rel} attribute whose value
     * is {@code nofollow}.
     *
     * @param str the attributes part of a tag
     * @return {@code true} if the nofollow pattern is found
     */
    private boolean isNofollow(String str) {
        return !StringUtils.isBlank(str) && NOFOLLOW_PATTERN.matcher(str).find();
    }

    /**
     * Turns a URL found in a document into an absolute URL.
     * <p>
     * The URL is trimmed, HTML-unescaped, then resolved against the referer
     * according to how it starts: {@code //} (same scheme), the separator
     * (absolute path), {@code ?} or {@code #} (same document), or no colon
     * at all (relative path). Anything else is kept as is.
     *
     * @param referer the referer to resolve against
     * @param str the URL as found in the document
     * @return the absolute URL, or {@code null} if the URL is blank, uses a
     *         scheme that is not accepted, or is longer than the maximum
     *         URL length once resolved
     */
    private String toCleanAbsoluteURL(Referer referer, String str) {
        String trimmed = StringUtils.trimToNull(str);
        if (!isValidNewURL(trimmed)) {
            return null;
        }
        // Validate again: unescaping may reveal a scheme that was hidden.
        String url = StringEscapeUtils.unescapeHtml4(trimmed);
        if (!isValidNewURL(url)) {
            return null;
        }

        // NOTE(review): used as the path separator, presumably "/" -- confirm.
        final String separator = SegmentCountURLFilter.DEFAULT_SEGMENT_SEPARATOR_PATTERN;
        if (url.startsWith("//")) {
            url = referer.scheme + StringUtils.substringAfter(url, "//");
        } else if (url.startsWith(separator)) {
            url = referer.absoluteBase + url;
        } else if (url.startsWith("?") || url.startsWith("#")) {
            url = referer.documentBase + url;
        } else if (!url.contains(":")) {
            String base = referer.relativeBase;
            if (!base.endsWith(separator)) {
                base = base + separator;
            }
            url = base + url;
        }

        if (url.length() > maxURLLength) {
            LOG.debug("URL length (" + url.length() + ") exceeding maximum length allowed (" + maxURLLength + ") to be extracted. URL (showing first 200 chars): " + StringUtils.substring(url, 0, LOGGING_MAX_URL_LENGTH) + "...");
            return null;
        }
        return url;
    }

    /**
     * Tells whether a URL may be extracted: it must not be blank and, when
     * it starts with a scheme, that scheme must be one of the accepted ones
     * (the defaults when none are configured).
     *
     * @param str the URL to check
     * @return {@code true} if the URL may be extracted
     */
    private boolean isValidNewURL(String str) {
        if (StringUtils.isBlank(str)) {
            return false;
        }
        boolean hasScheme = SCHEME_PATTERN.matcher(str).matches();
        if (!hasScheme) {
            // Relative URLs are always fine.
            return true;
        }

        String[] accepted = getSchemes();
        if (ArrayUtils.isEmpty(accepted)) {
            accepted = DEFAULT_SCHEMES;
        }
        boolean schemeAccepted = false;
        for (int i = 0; i < accepted.length && !schemeAccepted; i++) {
            schemeAccepted = StringUtils.startsWithIgnoreCase(str, accepted[i] + ":");
        }
        return schemeAccepted;
    }

    /**
     * Configures this extractor from XML.
     * <p>
     * Attributes and elements missing from the XML leave the corresponding
     * setting untouched, except for nofollow patterns which are always
     * replaced by those found (possibly none).
     *
     * @param reader the XML to read the configuration from
     */
    public void loadFromXML(Reader reader) {
        XMLConfiguration xml = XMLConfigurationUtil.newXMLConfiguration(reader);

        //--- Attributes ---
        setMaxURLLength(xml.getInt("[@maxURLLength]", getMaxURLLength()));
        setIgnoreNofollow(xml.getBoolean("[@ignoreNofollow]", isIgnoreNofollow()));
        setCommentsEnabled(xml.getBoolean("[@commentsEnabled]", isCommentsEnabled()));
        setCharset(xml.getString("[@charset]", getCharset()));
        boolean keepFragment = xml.getBoolean("[@keepFragment]", false);
        if (keepFragment) {
            LOG.warn("'keepFragment' on GenericLinkExtractor was removed. Instead, URL normalization now always takes place by default unless disabled, and removeFragment is part of the default normalization rules.");
        }

        //--- Content types and schemes ---
        String contentTypesValue = StringUtils.trimToNull(xml.getString("contentTypes"));
        ContentType[] loadedTypes = ContentType.valuesOf(StringUtils.split(contentTypesValue, ", "));
        if (ArrayUtils.isNotEmpty(loadedTypes)) {
            setContentTypes(loadedTypes);
        }
        String schemesValue = StringUtils.trimToNull(xml.getString("schemes"));
        String[] loadedSchemes = StringUtils.split(schemesValue, ", ");
        if (ArrayUtils.isNotEmpty(loadedSchemes)) {
            setSchemes(loadedSchemes);
        }

        //--- Link tags ---
        List<HierarchicalConfiguration> tagNodes = xml.configurationsAt("tags.tag");
        if (!tagNodes.isEmpty()) {
            clearLinkTags();
            for (HierarchicalConfiguration tagNode : tagNodes) {
                String tagName = tagNode.getString("[@name]", (String) null);
                String attribute = tagNode.getString("[@attribute]", (String) null);
                if (StringUtils.isNotBlank(tagName)) {
                    addLinkTag(tagName, attribute);
                }
            }
        }

        //--- Extract / no-extract betweens ---
        List<HierarchicalConfiguration> extractNodes = xml.configurationsAt("extractBetween");
        if (!extractNodes.isEmpty()) {
            this.extractBetweens.clear();
            for (HierarchicalConfiguration node : extractNodes) {
                String start = node.getString("start", (String) null);
                String end = node.getString("end", (String) null);
                boolean caseSensitive = node.getBoolean("[@caseSensitive]", false);
                addExtractBetween(start, end, caseSensitive);
            }
        }
        List<HierarchicalConfiguration> noExtractNodes = xml.configurationsAt("noExtractBetween");
        if (!noExtractNodes.isEmpty()) {
            this.noExtractBetweens.clear();
            for (HierarchicalConfiguration node : noExtractNodes) {
                String start = node.getString("start", (String) null);
                String end = node.getString("end", (String) null);
                boolean caseSensitive = node.getBoolean("[@caseSensitive]", false);
                addNoExtractBetween(start, end, caseSensitive);
            }
        }

        //--- Extract / no-extract selectors ---
        String[] loadedExtractSelectors = xml.getStringArray("extractSelector");
        if (ArrayUtils.isNotEmpty(loadedExtractSelectors)) {
            this.extractSelectors.clear();
            this.extractSelectors.addAll(Arrays.asList(loadedExtractSelectors));
        }
        String[] loadedNoExtractSelectors = xml.getStringArray("noExtractSelector");
        if (ArrayUtils.isNotEmpty(loadedNoExtractSelectors)) {
            this.noExtractSelectors.clear();
            this.noExtractSelectors.addAll(Arrays.asList(loadedNoExtractSelectors));
        }

        //--- Nofollow patterns (always replaced) ---
        this.nofollowPatterns.clear();
        String[] nofollowRegexes = xml.getStringArray("nofollow.regexUrl");
        for (String regex : nofollowRegexes) {
            this.nofollowPatterns.add(Pattern.compile(regex));
        }
    }

    /**
     * Writes this extractor configuration as an {@code <extractor>} XML
     * element.
     *
     * @param writer where to write the XML
     * @throws IOException if the XML cannot be written
     */
    public void saveToXML(Writer writer) throws IOException {
        try {
            EnhancedXMLStreamWriter xml = new EnhancedXMLStreamWriter(writer);
            xml.writeStartElement("extractor");

            //--- Attributes ---
            xml.writeAttribute("class", getClass().getCanonicalName());
            xml.writeAttributeInteger("maxURLLength", Integer.valueOf(getMaxURLLength()));
            xml.writeAttributeBoolean("ignoreNofollow", Boolean.valueOf(isIgnoreNofollow()));
            xml.writeAttributeBoolean("commentsEnabled", Boolean.valueOf(isCommentsEnabled()));
            xml.writeAttributeString("charset", getCharset());

            //--- Content types and schemes ---
            ContentType[] types = getContentTypes();
            if (ArrayUtils.isNotEmpty(types)) {
                xml.writeElementString("contentTypes", StringUtils.join(types, ','));
            }
            String[] acceptedSchemes = getSchemes();
            if (ArrayUtils.isNotEmpty(acceptedSchemes)) {
                xml.writeElementString("schemes", StringUtils.join(acceptedSchemes, ','));
            }

            //--- Link tags: one <tag> per tag/attribute combination ---
            xml.writeStartElement("tags");
            for (Map.Entry<String, List<String>> tagEntry : this.tagAttribs.entrySet()) {
                String tagName = tagEntry.getKey();
                for (String attribute : tagEntry.getValue()) {
                    xml.writeStartElement("tag");
                    xml.writeAttributeString("name", tagName);
                    xml.writeAttributeString("attribute", attribute);
                    xml.writeEndElement();
                }
            }
            xml.writeEndElement();

            //--- Extract / no-extract betweens ---
            for (RegexPair pair : this.extractBetweens) {
                xml.writeStartElement("extractBetween");
                xml.writeAttributeBoolean("caseSensitive", Boolean.valueOf(pair.isCaseSensitive()));
                xml.writeElementString("start", pair.getStart());
                xml.writeElementString("end", pair.getEnd());
                xml.writeEndElement();
            }
            for (RegexPair pair : this.noExtractBetweens) {
                xml.writeStartElement("noExtractBetween");
                xml.writeAttributeBoolean("caseSensitive", Boolean.valueOf(pair.isCaseSensitive()));
                xml.writeElementString("start", pair.getStart());
                xml.writeElementString("end", pair.getEnd());
                xml.writeEndElement();
            }

            //--- Extract / no-extract selectors ---
            for (String selector : this.extractSelectors) {
                xml.writeElementString("extractSelector", selector);
            }
            for (String selector : this.noExtractSelectors) {
                xml.writeElementString("noExtractSelector", selector);
            }

            //--- Nofollow patterns ---
            if (!this.nofollowPatterns.isEmpty()) {
                xml.writeStartElement("nofollow");
                for (Pattern nofollowPattern : this.nofollowPatterns) {
                    xml.writeElementString("regexUrl", nofollowPattern.pattern());
                }
                xml.writeEndElement();
            }

            xml.writeEndElement();
            xml.flush();
            xml.close();
        } catch (XMLStreamException e) {
            throw new IOException("Cannot save as XML.", e);
        }
    }

    /** @return a short-prefix style listing of this extractor's settings */
    public String toString() {
        ToStringBuilder builder = new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE);
        builder.append("contentTypes", this.contentTypes);
        builder.append("schemes", this.schemes);
        builder.append("maxURLLength", this.maxURLLength);
        builder.append("ignoreNofollow", this.ignoreNofollow);
        builder.append("commentsEnabled", this.commentsEnabled);
        builder.append("tagAttribs", this.tagAttribs);
        builder.append("charset", this.charset);
        builder.append("extractBetweens", this.extractBetweens);
        builder.append("noExtractBetweens", this.noExtractBetweens);
        builder.append("extractSelectors", this.extractSelectors);
        builder.append("noExtractSelectors", this.noExtractSelectors);
        return builder.toString();
    }

    /**
     * Compares this extractor's configuration with another one.
     * <p>
     * Nofollow patterns take part in the comparison through their source
     * text ({@link Pattern} does not define value equality), so that two
     * extractors differing only by their nofollow rules are not equal.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is a {@code GenericLinkExtractor}
     *         with the same configuration
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof GenericLinkExtractor)) {
            return false;
        }
        GenericLinkExtractor other = (GenericLinkExtractor) obj;
        return new EqualsBuilder()
                .append(this.contentTypes, other.contentTypes)
                .append(this.schemes, other.schemes)
                .append(this.maxURLLength, other.maxURLLength)
                .append(this.ignoreNofollow, other.ignoreNofollow)
                .append(this.commentsEnabled, other.commentsEnabled)
                .append(this.tagAttribs.entrySet(), other.tagAttribs.entrySet())
                .append(this.charset, other.charset)
                .append(this.extractBetweens, other.extractBetweens)
                .append(this.noExtractBetweens, other.noExtractBetweens)
                .append(this.extractSelectors, other.extractSelectors)
                .append(this.noExtractSelectors, other.noExtractSelectors)
                .append(getNofollowPatterns(), other.getNofollowPatterns())
                .isEquals();
    }

    /**
     * Computes a hash code from the same settings {@link #equals(Object)}
     * compares, nofollow patterns included (as their source text).
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        return new HashCodeBuilder()
                .append(this.contentTypes)
                .append(this.schemes)
                .append(this.maxURLLength)
                .append(this.ignoreNofollow)
                .append(this.commentsEnabled)
                .append(this.tagAttribs)
                .append(this.charset)
                .append(this.extractBetweens)
                .append(this.noExtractBetweens)
                .append(this.extractSelectors)
                .append(this.noExtractSelectors)
                .append(getNofollowPatterns())
                .toHashCode();
    }
}
