package com.norconex.collector.http.url.impl;

import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.Link;
import com.norconex.commons.lang.config.IXMLConfigurable;
import com.norconex.commons.lang.config.XMLConfigurationUtil;
import com.norconex.commons.lang.file.ContentType;
import com.norconex.commons.lang.url.HttpURL;
import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Writer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.LinkContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:com/norconex/collector/http/url/impl/TikaLinkExtractor.class */
/**
 * Link extractor that parses a document with Apache Tika's {@link HtmlParser}
 * and converts the links collected by a {@link LinkContentHandler} into
 * collector {@link Link} instances. A "refresh" entry found in the parsed
 * metadata is also turned into a link.
 *
 * <p>XML configuration read/written by this class: an {@code ignoreNofollow}
 * attribute and a {@code contentTypes} element holding a comma and/or space
 * separated list of content types.</p>
 */
public class TikaLinkExtractor implements ILinkExtractor, IXMLConfigurable {
    private static final Logger LOG = LogManager.getLogger(TikaLinkExtractor.class);

    /** Content types accepted until {@link #setContentTypes(ContentType...)} is called. */
    private static final ContentType[] DEFAULT_CONTENT_TYPES = {
        ContentType.HTML,
        ContentType.valueOf("application/xhtml+xml"),
        ContentType.valueOf("vnd.wap.xhtml+xml"),
        ContentType.valueOf("x-asp")
    };

    /** Locates the {@code url=...} portion of a "refresh" metadata value. */
    private static final Pattern META_REFRESH_PATTERN = Pattern.compile(
            "(\\W|^)(url)(\\s*=\\s*)([\"']{0,1})(.+?)([\"'>])",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Capturing group of {@link #META_REFRESH_PATTERN} that holds the URL. */
    private static final int URL_PATTERN_GROUP_URL = 5;

    /** Mapper handed to Tika so that the "title" attribute of anchors is kept. */
    private static final HtmlMapper FIXED_HTML_MAPPER = new FixedHtmlParserMapper();

    private ContentType[] contentTypes = DEFAULT_CONTENT_TYPES;
    private boolean ignoreNofollow;

    /**
     * Tika HTML mapper that additionally lets the {@code title} attribute of
     * {@code a} elements through; everything else is delegated to
     * {@link DefaultHtmlMapper}.
     */
    private static class FixedHtmlParserMapper extends DefaultHtmlMapper {
        private FixedHtmlParserMapper() {
        }

        @Override
        public String mapSafeAttribute(String elementName, String attributeName) {
            if ("a".equals(elementName) && "title".equals(attributeName)) {
                return "title";
            }
            return super.mapSafeAttribute(elementName, attributeName);
        }
    }

    /**
     * Parses the stream as HTML and returns the links found in it.
     *
     * <p>The input stream is always closed (quietly) before this method
     * returns, whether or not parsing succeeded.</p>
     *
     * @param inputStream the document content; closed by this method
     * @param str the URL of the document; used as the base for relative
     *        links and set as the referrer of every returned link
     * @param contentType the document content type (not used by this
     *        implementation)
     * @return the extracted links, never {@code null}
     * @throws IOException if reading fails or the content cannot be parsed
     */
    @Override
    public Set<Link> extractLinks(InputStream inputStream, String str, ContentType contentType) throws IOException {
        LinkContentHandler linkHandler = new LinkContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        parseContext.set(HtmlMapper.class, FIXED_HTML_MAPPER);
        try {
            new HtmlParser().parse(inputStream, linkHandler, metadata, parseContext);
        } catch (TikaException | SAXException e) {
            throw new IOException("Could not parse to extract URLs: " + str, e);
        } finally {
            // Close on failure too, so a parse error does not leak the stream.
            IOUtils.closeQuietly(inputStream);
        }

        List<org.apache.tika.sax.Link> tikaLinks = linkHandler.getLinks();
        Set<Link> extracted = new HashSet<>(tikaLinks.size());
        for (org.apache.tika.sax.Link tikaLink : tikaLinks) {
            if (!isIgnoreNofollow() && isNofollow(tikaLink.getRel())) {
                continue;
            }
            Link link = toLink(tikaLink, str);
            if (link != null) {
                extracted.add(link);
            }
        }

        Link refreshLink = extractMetaRefreshLink(metadata, str);
        if (refreshLink != null) {
            extracted.add(refreshLink);
        }
        return extracted;
    }

    /**
     * Tells whether a {@code rel} attribute value contains the
     * {@code nofollow} token. The value is treated as a whitespace-separated
     * list so that values such as {@code "nofollow noopener"} are recognized.
     */
    private static boolean isNofollow(String rel) {
        String[] tokens = StringUtils.split(rel);
        if (tokens == null) {
            return false;
        }
        for (String token : tokens) {
            if ("nofollow".equalsIgnoreCase(token)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts a Tika link into a collector link.
     *
     * @return the converted link, or {@code null} when the Tika link has a
     *         blank URI or resolves to a blank URL
     */
    private Link toLink(org.apache.tika.sax.Link tikaLink, String referrerUrl) {
        String uri = tikaLink.getUri();
        if (StringUtils.isBlank(uri)) {
            return null;
        }
        String absoluteUrl = toAbsoluteUrl(referrerUrl, uri);
        if (StringUtils.isBlank(absoluteUrl)) {
            return null;
        }

        Link link = new Link(absoluteUrl);
        link.setReferrer(referrerUrl);
        if (StringUtils.isNotBlank(tikaLink.getText())) {
            link.setText(tikaLink.getText());
        }
        if (tikaLink.isAnchor()) {
            link.setTag("a.href");
        } else if (tikaLink.isImage()) {
            link.setTag("img.src");
        }
        if (StringUtils.isNotBlank(tikaLink.getTitle())) {
            link.setTitle(tikaLink.getTitle());
        }
        return link;
    }

    /**
     * Makes an extracted URI absolute against the document URL.
     *
     * <p>Query-only ("?...") and fragment-only ("#...") references are
     * appended to the document URL after removing the part they replace, so
     * a base that already has a query string or fragment does not end up
     * with two of them. Any other reference goes through
     * {@link #resolve(String, String)}.</p>
     */
    private String toAbsoluteUrl(String baseUrl, String uri) {
        if (uri.startsWith("?")) {
            // A new query replaces both the existing query and fragment.
            String withoutFragment = StringUtils.substringBefore(baseUrl, "#");
            return StringUtils.substringBefore(withoutFragment, "?") + uri;
        }
        if (uri.startsWith("#")) {
            // A new fragment replaces only the existing fragment.
            return StringUtils.substringBefore(baseUrl, "#") + uri;
        }
        return resolve(baseUrl, uri);
    }

    /**
     * Builds a link from the "refresh" metadata entry, if there is one.
     *
     * <p>When the value does not match {@link #META_REFRESH_PATTERN}, the
     * whole value is resolved as the target.</p>
     *
     * @return the refresh link, or {@code null} when there is no non-blank
     *         refresh value or it resolves to a blank URL
     */
    private Link extractMetaRefreshLink(Metadata metadata, String referrerUrl) {
        String refresh = getCaseInsensitive(metadata, "refresh");
        if (StringUtils.isBlank(refresh)) {
            return null;
        }
        Matcher matcher = META_REFRESH_PATTERN.matcher(refresh);
        String target = matcher.find() ? matcher.group(URL_PATTERN_GROUP_URL) : refresh;
        String absoluteUrl = resolve(referrerUrl, target);
        if (StringUtils.isBlank(absoluteUrl)) {
            return null;
        }
        Link link = new Link(absoluteUrl);
        link.setReferrer(referrerUrl);
        return link;
    }

    /**
     * Returns the value of the first metadata entry whose name equals the
     * given name ignoring case, or {@code null} when there is none.
     */
    private String getCaseInsensitive(Metadata metadata, String name) {
        for (String candidate : metadata.names()) {
            if (StringUtils.equalsIgnoreCase(candidate, name)) {
                return metadata.get(candidate);
            }
        }
        return null;
    }

    /**
     * Gets the content types accepted by this extractor.
     *
     * @return a copy of the configured content types
     */
    public ContentType[] getContentTypes() {
        return (ContentType[]) ArrayUtils.clone(this.contentTypes);
    }

    /**
     * Sets the content types accepted by this extractor. The array is copied.
     *
     * @param contentTypeArr content types to accept; when {@code null} or
     *        empty, {@link #accepts(String, ContentType)} accepts everything
     */
    public void setContentTypes(ContentType... contentTypeArr) {
        this.contentTypes = (ContentType[]) ArrayUtils.clone(contentTypeArr);
    }

    /**
     * Tells whether links marked {@code rel="nofollow"} are extracted anyway.
     *
     * @return {@code true} if "nofollow" is ignored
     */
    public boolean isIgnoreNofollow() {
        return this.ignoreNofollow;
    }

    /**
     * Sets whether links marked {@code rel="nofollow"} are extracted anyway.
     *
     * @param z {@code true} to ignore "nofollow" and extract such links
     */
    public void setIgnoreNofollow(boolean z) {
        this.ignoreNofollow = z;
    }

    /**
     * @return always {@code true}
     * @deprecated the referrer is always set on extracted links
     */
    @Deprecated
    public boolean isKeepReferrerData() {
        return true;
    }

    /**
     * Has no effect other than logging a warning.
     *
     * @param z ignored
     * @deprecated the referrer is always set on extracted links
     */
    @Deprecated
    public void setKeepReferrerData(boolean z) {
        LOG.warn("Since 2.6.0, referrer data is always kept. Setting \"keepReferrerData\" has no effect.");
    }

    /**
     * Tells whether this extractor handles the given content type.
     *
     * @param str the document URL (not used by this implementation)
     * @param contentType the document content type
     * @return {@code true} if no content types are configured or the given
     *         one is among those configured
     */
    @Override
    public boolean accepts(String str, ContentType contentType) {
        return ArrayUtils.isEmpty(this.contentTypes)
                || ArrayUtils.contains(this.contentTypes, contentType);
    }

    /** Resolves a possibly relative URL against a base URL. */
    private String resolve(String baseUrl, String relativeUrl) {
        return HttpURL.toAbsolute(baseUrl, relativeUrl);
    }

    /**
     * Loads this extractor's settings from XML. The {@code ignoreNofollow}
     * attribute defaults to the current value; content types are only
     * replaced when the {@code contentTypes} element yields at least one.
     *
     * @param reader source of the XML configuration
     */
    public void loadFromXML(Reader reader) {
        XMLConfiguration xml = XMLConfigurationUtil.newXMLConfiguration(reader);
        setIgnoreNofollow(xml.getBoolean("[@ignoreNofollow]", isIgnoreNofollow()));

        String rawTypes = StringUtils.trimToNull(xml.getString("contentTypes"));
        ContentType[] parsedTypes = ContentType.valuesOf(StringUtils.split(rawTypes, ", "));
        if (!ArrayUtils.isEmpty(parsedTypes)) {
            setContentTypes(parsedTypes);
        }
    }

    /**
     * Writes this extractor's settings as an {@code extractor} XML element.
     *
     * @param writer destination of the XML
     * @throws IOException if the XML cannot be written
     */
    public void saveToXML(Writer writer) throws IOException {
        try {
            EnhancedXMLStreamWriter xml = new EnhancedXMLStreamWriter(writer);
            xml.writeStartElement("extractor");
            xml.writeAttribute("class", getClass().getCanonicalName());
            xml.writeAttributeBoolean("ignoreNofollow", Boolean.valueOf(isIgnoreNofollow()));
            ContentType[] types = getContentTypes();
            if (!ArrayUtils.isEmpty(types)) {
                xml.writeElementString("contentTypes", StringUtils.join(types, ','));
            }
            xml.writeEndElement();
            xml.flush();
            xml.close();
        } catch (XMLStreamException e) {
            throw new IOException("Cannot save as XML.", e);
        }
    }

    @Override
    public String toString() {
        return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
                .append("contentTypes", this.contentTypes)
                .append("ignoreNofollow", this.ignoreNofollow)
                .toString();
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof TikaLinkExtractor)) {
            return false;
        }
        TikaLinkExtractor other = (TikaLinkExtractor) obj;
        return new EqualsBuilder()
                .append(this.contentTypes, other.contentTypes)
                .append(this.ignoreNofollow, other.ignoreNofollow)
                .isEquals();
    }

    @Override
    public int hashCode() {
        return new HashCodeBuilder()
                .append(this.contentTypes)
                .append(this.ignoreNofollow)
                .toHashCode();
    }
}
