package com.norconex.collector.http.url.impl;

import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.Link;
import com.norconex.commons.lang.config.XMLConfigurationUtil;
import com.norconex.commons.lang.file.ContentType;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.junit.Assert;
import org.junit.Test;

/* loaded from: input_file:com/norconex/collector/http/url/impl/LinkExtractorTest.class */
public class LinkExtractorTest {
    @Test
    public void testGenericLinkExtractor() throws IOException {
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        genericLinkExtractor.addLinkTag("link", (String) null);
        testLinkExtraction(genericLinkExtractor);
    }

    @Test
    public void testTikaLinkExtractor() throws IOException {
        testLinkExtraction(new TikaLinkExtractor());
    }

    private void testLinkExtraction(ILinkExtractor iLinkExtractor) throws IOException {
        String str = "http://www.example.com/test/";
        String str2 = str + "LinkExtractorTest.html";
        String[] strArr = {"http://www.example.com/meta-redirect.html", "http://www.example.com/startWithDoubleslash.html", str2 + "?startWith=questionmark", str2 + "#startWithHashMark", "http://www.example.com/startWithSlash.html", str + "relativeToLastSegment.html", "http://www.sample.com/blah.html", "http://www.example.com/onTwoLines.html", "http://www.example.com/imageSlash.gif", "http://www.example.com/imageNoSlash.gif", str + "titleTarget.html", "http://www.example.com/htmlEntities", "http://www.example.com/?p1=v1&p2=v2&p3=v3", "http://www.example.com/contains two spaces.html"};
        String[] strArr2 = {"http://www.example.com/badhref.html", "http://www.example.com/nofollow.html", "http://www.example.com//dont/process/scripts/'+variable+'", "http://www.example.com//dont/process/a/'+inscript+'", "http://www.example.com/comment.html", str};
        if (iLinkExtractor instanceof GenericLinkExtractor) {
            strArr = (String[]) ArrayUtils.addAll(strArr, new String[]{"http://www.example.com/addedTagNoAttribUrlInBody.html", "http://www.example.com/addedTagAttribUrlInBody.html"});
            strArr2 = (String[]) ArrayUtils.addAll(strArr2, new String[]{"tel:123", "mailto:blah@blah.com"});
        }
        if (iLinkExtractor instanceof TikaLinkExtractor) {
            strArr = (String[]) ArrayUtils.addAll(strArr, new String[]{"tel:123", "mailto:blah@blah.com"});
        }
        InputStream resourceAsStream = getClass().getResourceAsStream("LinkExtractorTest.html");
        Set<Link> extractLinks = iLinkExtractor.extractLinks(resourceAsStream, str2, ContentType.HTML);
        IOUtils.closeQuietly(resourceAsStream);
        for (String str3 : strArr) {
            Assert.assertTrue("Could not find expected URL: " + str3, contains(extractLinks, str3));
        }
        for (String str4 : strArr2) {
            Assert.assertFalse("Found unexpected URL: " + str4, contains(extractLinks, str4));
        }
        Assert.assertEquals("Invalid number of links extracted.", strArr.length, extractLinks.size());
    }

    @Test
    public void testGenericBaseHrefLinkExtractor() throws IOException {
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        testBaseHrefLinkExtraction(genericLinkExtractor);
        testRelativeBaseHrefLinkExtraction(genericLinkExtractor);
    }

    @Test
    public void testTikaBaseHrefLinkExtractor() throws IOException {
        testBaseHrefLinkExtraction(new TikaLinkExtractor());
    }

    private void testBaseHrefLinkExtraction(ILinkExtractor iLinkExtractor) throws IOException {
        InputStream resourceAsStream = getClass().getResourceAsStream("LinkBaseHrefTest.html");
        Set<Link> extractLinks = iLinkExtractor.extractLinks(resourceAsStream, "http://www.example.com/test/absolute/LinkBaseHrefTest.html", ContentType.HTML);
        IOUtils.closeQuietly(resourceAsStream);
        for (String str : new String[]{("http://www.sample.com/blah/") + "a/b/c.html", "http://www.sample.com/d/e/f.html", "http://www.sample.com/g/h/i.html", "http://www.anotherhost.com/k/l/m.html"}) {
            Assert.assertTrue("Could not find expected URL: " + str, contains(extractLinks, str));
        }
        Assert.assertEquals("Invalid number of links extracted.", r0.length, extractLinks.size());
    }

    private void testRelativeBaseHrefLinkExtraction(ILinkExtractor iLinkExtractor) throws IOException {
        InputStream resourceAsStream = getClass().getResourceAsStream("LinkRelativeBaseHrefTest.html");
        Set<Link> extractLinks = iLinkExtractor.extractLinks(resourceAsStream, "http://www.example.com/test/relative/LinkRelativeBaseHrefTest.html", ContentType.HTML);
        IOUtils.closeQuietly(resourceAsStream);
        for (String str : new String[]{"http://www.example.com/test/relative/blah.html?param=value", "http://www.example.com/d/e/f.html", "http://www.example.com/test/relative/path^blah.html", "http://www.anotherhost.com/k/l/m.html"}) {
            Assert.assertTrue("Could not find expected URL: " + str, contains(extractLinks, str));
        }
        Assert.assertEquals("Invalid number of links extracted.", r0.length, extractLinks.size());
    }

    @Test
    public void testGenericLinkKeepReferrer() throws IOException {
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        genericLinkExtractor.setContentTypes(new ContentType[]{ContentType.HTML});
        testLinkKeepReferrer(genericLinkExtractor);
    }

    @Test
    public void testTikaLinkKeepReferrer() throws IOException {
        TikaLinkExtractor tikaLinkExtractor = new TikaLinkExtractor();
        tikaLinkExtractor.setContentTypes(new ContentType[]{ContentType.HTML});
        testLinkKeepReferrer(tikaLinkExtractor);
    }

    private void testLinkKeepReferrer(ILinkExtractor iLinkExtractor) throws IOException {
        Link[] linkArr = {keepReferrerLink("1-notitle-notext.html", null, null), keepReferrerLink("2-notitle-yestext.html", "2 Yes Text", null), keepReferrerLink("3-yestitle-yestext.html", "3 Yes Text", "3 Yes Title"), keepReferrerLink("4-yestitle-notext.html", null, "4 Yes Title"), keepReferrerLink("6-yestitle-yestexthtml.html", "[6]Yes Text", "6 Yes Title")};
        InputStream resourceAsStream = getClass().getResourceAsStream("LinkKeepReferrerTest.html");
        Set<Link> extractLinks = iLinkExtractor.extractLinks(resourceAsStream, "http://www.site.com/parent.html", ContentType.HTML);
        IOUtils.closeQuietly(resourceAsStream);
        Assert.assertEquals(linkArr.length, extractLinks.size());
        for (Link link : linkArr) {
            Assert.assertTrue("Could not find expected link: " + link, contains(extractLinks, link));
        }
    }

    private Link keepReferrerLink(String str, String str2, String str3) {
        Link link = new Link("http://www.site.com/" + str);
        link.setReferrer("http://www.site.com/parent.html");
        link.setTag("a.href");
        link.setText(str2);
        link.setTitle(str3);
        return link;
    }

    @Test
    public void testExtractBetween() throws IOException {
        String[] strArr = {"http://www.example.com/include1.html", "http://www.example.com/include2.html", "http://www.example.com/include3.html", "http://www.example.com/include4.html", "http://www.example.com/include5.html"};
        String[] strArr2 = {"http://www.example.com/exclude1.html", "http://www.example.com/exclude2.html", "http://www.example.com/exclude3.html", "http://www.example.com/exclude4.html", "http://www.example.com/exclude5.html", "http://www.example.com/exclude6.html", "http://www.example.com/exclude7.html"};
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        genericLinkExtractor.addExtractBetween("<include1>", "</include1>\\s+", true);
        genericLinkExtractor.addExtractBetween("<Include2>", "</Include2>\\s+", false);
        genericLinkExtractor.addNoExtractBetween("<exclude1>", "</exclude1>\\s+", true);
        genericLinkExtractor.addNoExtractBetween("<Exclude2>", "</Exclude2>\\s+", false);
        InputStream resourceAsStream = getClass().getResourceAsStream("LinkExtractBetweenTest.html");
        Throwable th = null;
        try {
            try {
                Set<Link> extractLinks = genericLinkExtractor.extractLinks(resourceAsStream, "http://www.example.com/LinkExtractBetweenTest.html", ContentType.HTML);
                if (resourceAsStream != null) {
                    if (0 != 0) {
                        try {
                            resourceAsStream.close();
                        } catch (Throwable th2) {
                            th.addSuppressed(th2);
                        }
                    } else {
                        resourceAsStream.close();
                    }
                }
                for (String str : strArr) {
                    Assert.assertTrue("Could not find expected URL: " + str, contains(extractLinks, str));
                }
                for (String str2 : strArr2) {
                    Assert.assertFalse("Found unexpected URL: " + str2, contains(extractLinks, str2));
                }
                Assert.assertEquals("Invalid number of links extracted.", strArr.length, extractLinks.size());
            } finally {
            }
        } catch (Throwable th3) {
            if (resourceAsStream != null) {
                if (th != null) {
                    try {
                        resourceAsStream.close();
                    } catch (Throwable th4) {
                        th.addSuppressed(th4);
                    }
                } else {
                    resourceAsStream.close();
                }
            }
            throw th3;
        }
    }

    @Test
    public void testExtractSelector() throws IOException {
        String[] strArr = {"http://www.example.com/include1.html", "http://www.example.com/include2.html", "http://www.example.com/include3.html", "http://www.example.com/include4.html", "http://www.example.com/include5.html"};
        String[] strArr2 = {"http://www.example.com/exclude1.html", "http://www.example.com/exclude2.html", "http://www.example.com/exclude3.html", "http://www.example.com/exclude4.html", "http://www.example.com/exclude5.html", "http://www.example.com/exclude6.html", "http://www.example.com/exclude7.html"};
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        genericLinkExtractor.addExtractSelectors(new String[]{"include1"});
        genericLinkExtractor.addExtractSelectors(new String[]{"include2"});
        genericLinkExtractor.addNoExtractSelectors(new String[]{"exclude1"});
        genericLinkExtractor.addNoExtractSelectors(new String[]{"exclude2"});
        InputStream resourceAsStream = getClass().getResourceAsStream("LinkExtractBetweenTest.html");
        Throwable th = null;
        try {
            try {
                Set<Link> extractLinks = genericLinkExtractor.extractLinks(resourceAsStream, "http://www.example.com/LinkExtractBetweenTest.html", ContentType.HTML);
                if (resourceAsStream != null) {
                    if (0 != 0) {
                        try {
                            resourceAsStream.close();
                        } catch (Throwable th2) {
                            th.addSuppressed(th2);
                        }
                    } else {
                        resourceAsStream.close();
                    }
                }
                for (String str : strArr) {
                    Assert.assertTrue("Could not find expected URL: " + str, contains(extractLinks, str));
                }
                for (String str2 : strArr2) {
                    Assert.assertFalse("Found unexpected URL: " + str2, contains(extractLinks, str2));
                }
                Assert.assertEquals("Invalid number of links extracted.", strArr.length, extractLinks.size());
            } finally {
            }
        } catch (Throwable th3) {
            if (resourceAsStream != null) {
                if (th != null) {
                    try {
                        resourceAsStream.close();
                    } catch (Throwable th4) {
                        th.addSuppressed(th4);
                    }
                } else {
                    resourceAsStream.close();
                }
            }
            throw th3;
        }
    }

    @Test
    public void testGenericWriteRead() throws IOException {
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        genericLinkExtractor.setContentTypes(new ContentType[]{ContentType.HTML, ContentType.XML});
        genericLinkExtractor.setIgnoreNofollow(true);
        genericLinkExtractor.addLinkTag("food", "chocolate");
        genericLinkExtractor.addLinkTag("friend", "Thor");
        genericLinkExtractor.addExtractBetween("start1", "end1", true);
        genericLinkExtractor.addExtractBetween("start2", "end2", false);
        genericLinkExtractor.addNoExtractBetween("nostart1", "noend1", true);
        genericLinkExtractor.addNoExtractBetween("nostart2", "noend2", false);
        System.out.println("Writing/Reading this: " + genericLinkExtractor);
        XMLConfigurationUtil.assertWriteRead(genericLinkExtractor);
    }

    @Test
    public void testGenericEquivRefreshIssue210() throws IOException {
        Set extractLinks = new GenericLinkExtractor().extractLinks(new ByteArrayInputStream("<html><head><meta http-equiv=\"refresh\" content=\"0; URL=en/91/index.html\"></head><body></body></html>".getBytes()), "http://db-artmag.com/index_en.html", ContentType.HTML);
        Assert.assertEquals("Invalid number of links extracted.", 1L, extractLinks.size());
        Assert.assertEquals("http://db-artmag.com/en/91/index.html", ((Link) extractLinks.iterator().next()).getUrl());
    }

    @Test
    public void testTikaWriteRead() throws IOException {
        TikaLinkExtractor tikaLinkExtractor = new TikaLinkExtractor();
        tikaLinkExtractor.setContentTypes(new ContentType[]{ContentType.HTML, ContentType.XML});
        tikaLinkExtractor.setIgnoreNofollow(true);
        System.out.println("Writing/Reading this: " + tikaLinkExtractor);
        XMLConfigurationUtil.assertWriteRead(tikaLinkExtractor);
    }

    @Test
    public void testIssue188() throws IOException {
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream("<html><body><a href=\"/en/articles/detail/article-x.html\">test link</a></body></html>".getBytes());
        Set<Link> extractLinks = new GenericLinkExtractor().extractLinks(byteArrayInputStream, "http://www.site.com/en/articles/articles.html?param1=value1&param2=value2", ContentType.HTML);
        byteArrayInputStream.close();
        Assert.assertTrue("URL not extracted: http://www.site.com/en/articles/detail/article-x.html", contains(extractLinks, "http://www.site.com/en/articles/detail/article-x.html"));
    }

    @Test
    public void testIssue236() throws IOException {
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(("<html><body><a href=\"javascript:__doPostBack('MoreInfoList1$Pager','2')\">JavaScript link</a></body></html>").getBytes());
        GenericLinkExtractor genericLinkExtractor = new GenericLinkExtractor();
        genericLinkExtractor.setSchemes(new String[]{"javascript"});
        Set<Link> extractLinks = genericLinkExtractor.extractLinks(byteArrayInputStream, "N/A", ContentType.HTML);
        byteArrayInputStream.close();
        Assert.assertTrue("URL not extracted: javascript:__doPostBack('MoreInfoList1$Pager','2')", contains(extractLinks, "javascript:__doPostBack('MoreInfoList1$Pager','2')"));
    }

    @Test
    public void testGenericBadlyFormedURL() throws IOException {
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream("<html><body><a href=\"/invalid^path^.html\">test link</a></body></html>".getBytes());
        Set<Link> extractLinks = new GenericLinkExtractor().extractLinks(byteArrayInputStream, "http://www.example.com/index.html", ContentType.HTML);
        byteArrayInputStream.close();
        Assert.assertTrue("URL not extracted: http://www.example.com/invalid^path^.html", contains(extractLinks, "http://www.example.com/invalid^path^.html"));
    }

    @Test
    public void testUnquottedURL() throws IOException {
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream("<html><body><a href=unquoted_url1.html>test link 1</a><a href=unquoted_url2.html title=\"blah\">test link 2</a></body></html>".getBytes());
        Set<Link> extractLinks = new GenericLinkExtractor().extractLinks(byteArrayInputStream, "http://www.example.com/index.html", ContentType.HTML);
        byteArrayInputStream.close();
        Assert.assertTrue("Could not find expected URL: http://www.example.com/unquoted_url1.html", contains(extractLinks, "http://www.example.com/unquoted_url1.html"));
        Assert.assertTrue("Could not find expected URL: http://www.example.com/unquoted_url2.html", contains(extractLinks, "http://www.example.com/unquoted_url2.html"));
    }

    @Test
    public void testBadQuotingURL() throws IOException {
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream("<html><body><a href=bad\"quote1.html>test link 1</a><a href=\"bad'quote2.html\">test link 1</a><a href='bad\"quote3.html'>test link 1</a></body></html>".getBytes());
        Set<Link> extractLinks = new GenericLinkExtractor().extractLinks(byteArrayInputStream, "http://www.example.com/index.html", ContentType.HTML);
        byteArrayInputStream.close();
        Assert.assertTrue("Could not find expected URL: http://www.example.com/bad\"quote1.html", contains(extractLinks, "http://www.example.com/bad\"quote1.html"));
        Assert.assertTrue("Could not find expected URL: http://www.example.com/bad'quote2.html", contains(extractLinks, "http://www.example.com/bad'quote2.html"));
        Assert.assertTrue("Could not find expected URL: http://www.example.com/bad\"quote3.html", contains(extractLinks, "http://www.example.com/bad\"quote3.html"));
    }

    private boolean contains(Set<Link> set, String str) {
        Iterator<Link> it = set.iterator();
        while (it.hasNext()) {
            if (str.equals(it.next().getUrl())) {
                return true;
            }
        }
        return false;
    }

    private boolean contains(Set<Link> set, Link link) {
        Iterator<Link> it = set.iterator();
        while (it.hasNext()) {
            if (link.equals(it.next())) {
                return true;
            }
        }
        return false;
    }
}
