package com.norconex.collector.http.crawler;

import com.norconex.collector.core.crawler.ICrawler;
import com.norconex.collector.core.crawler.event.CrawlerEvent;
import com.norconex.collector.core.crawler.event.ICrawlerEventListener;
import com.norconex.collector.http.HttpCollector;
import com.norconex.collector.http.doc.HttpDocument;
import com.norconex.collector.http.doc.HttpMetadata;
import com.norconex.collector.http.fetch.impl.GenericDocumentFetcher;
import com.norconex.collector.http.url.ILinkExtractor;
import com.norconex.collector.http.url.impl.GenericLinkExtractor;
import com.norconex.commons.lang.file.FileUtil;
import com.norconex.commons.lang.file.IFileVisitor;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.mutable.MutableObject;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.junit.Assert;
import org.junit.Test;

/* loaded from: input_file:com/norconex/collector/http/crawler/BasicFeaturesTest.class */
/**
 * End-to-end tests exercising basic crawler features against the test
 * pages addressed by {@code /test?case=...} URLs.
 *
 * <p>Each test builds a collector holding a single crawler through
 * {@code newHttpCollector1Crawler}, tweaks the crawler configuration,
 * runs the collector, and then inspects the documents returned by
 * {@code getCommitedDocuments}. Both helpers are inherited; their exact
 * behavior is not visible from this class.</p>
 */
public class BasicFeaturesTest extends AbstractHttpTest {
    private static final Logger LOG = LogManager.getLogger(BasicFeaturesTest.class);

    /**
     * A start URL that redirects must yield exactly one document whose
     * reference is the redirect target, and whose two referenced URLs
     * are located under the redirected path.
     */
    @Test
    public void testRedirect() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=redirect");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpDocument doc = docs.get(0);
        String ref = doc.getReference();
        List<String> urls = doc.getMetadata().getStrings("collector.url");
        LOG.debug("URLs:" + urls);
        assertListSize("URL", urls, 1);
        Assert.assertTrue("Invalid redirection URL: " + ref,
                ref.contains("/test/redirected/page.html?case=redirect"));

        List<String> referencedUrls = doc.getMetadata().getStrings("collector.referenced-urls");
        assertListSize("referenced URLs", referencedUrls, 2);
        for (int i = 0; i < 2; i++) {
            String url = referencedUrls.get(i);
            Assert.assertTrue("Invalid relative URL: " + url,
                    url.matches(".*/test/redirected/page[12].html"));
        }
    }

    /**
     * A chain of redirects must be recorded in the redirect trail
     * (five entries, the first without a {@code count} parameter) and the
     * committed reference must be the final hop ({@code count=5}).
     */
    @Test
    public void testMultiRedirects() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=multiRedirects");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpDocument doc = docs.get(0);
        String ref = doc.getReference();
        List<String> trail = doc.getMetadata().getStrings("collector.redirect-trail");
        LOG.debug("Redirect source URLs:" + trail);
        assertListSize("URL", trail, 5);

        // The original URL carries no counter; each subsequent hop does.
        Assert.assertFalse(trail.get(0).contains("count"));
        for (int hop = 1; hop <= 4; hop++) {
            Assert.assertTrue(trail.get(hop).contains("count=" + hop));
        }
        Assert.assertTrue("Invalid redirection URL: " + ref, ref.contains("count=5"));
    }

    /**
     * A canonical/redirect loop must end with exactly one committed
     * document, regardless of whether crawling starts on the canonical
     * side or on the redirect side of the loop. In both runs the committed
     * reference is expected to be the {@code type=canonical} URL.
     */
    @Test
    public void testCanonicalRedirectLoop() throws IOException {
        assertCanonicalRedirectLoop("/test?case=canonRedirLoop&type=canonical");
        assertCanonicalRedirectLoop("/test?case=canonRedirLoop&type=redirect");
    }

    /**
     * Crawling ten levels deep from depth 0 must commit eleven documents,
     * each carrying single-valued, expected content-type metadata.
     */
    @Test
    public void testBasicFeatures() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=basic&depth=0");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(10);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        testDepth(docs);
        for (HttpDocument doc : docs) {
            testValidMetadata(doc);
        }
    }

    /**
     * With "keep downloads" enabled, the raw downloaded file must be found
     * under the crawler's {@code downloads} directory with its markup intact.
     */
    @Test
    public void testKeepDownload() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test/a$dir/blah?case=keepDownloads");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        crawler.getCrawlerConfig().setKeepDownloads(true);
        collector.start(false);

        File downloadDir = new File(crawler.getCrawlerConfig().getWorkDir(), "downloads");
        final MutableObject<File> downloaded = new MutableObject<File>();
        FileUtil.visitAllFiles(downloadDir, new IFileVisitor() {
            @Override
            public void visit(File file) {
                // Keep only the first file visited below the downloads directory.
                if (downloaded.getValue() == null && file.toString().contains("downloads")) {
                    downloaded.setValue(file);
                }
            }
        });

        // Fail with a meaningful message rather than an NPE when nothing was saved.
        Assert.assertNotNull("Invalid or missing download file.", downloaded.getValue());
        String content = FileUtils.readFileToString(downloaded.getValue(), StandardCharsets.UTF_8);
        Assert.assertTrue("Invalid or missing download file.", content.contains(
                "<b>This</b> file <i>must</i> be saved as is, with this <span>formatting</span>"));
    }

    /**
     * The configured maximum number of documents (15) must cap the number
     * of committed documents.
     */
    @Test
    public void testMaxURLs() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=basic&depth=0");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDocuments(15);
        collector.start(false);

        assertListSize("URLs", getCommitedDocuments(crawler), 15);
    }

    /**
     * The configured user agent must show up in the content of the
     * committed test page.
     */
    @Test
    public void testUserAgent() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=userAgent");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        crawler.getCrawlerConfig().setUserAgent("Super Secret Agent");
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);
        Assert.assertTrue("Wrong or undetected User-Agent.",
                contentOf(docs.get(0)).contains("Super Secret Agent"));
    }

    /**
     * Only one document must be committed for the canonical test case,
     * and exactly two {@code REJECTED_NONCANONICAL} events must be fired.
     */
    @Test
    public void testCanonicalLink() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=canonical");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];

        final MutableInt rejections = new MutableInt();
        crawler.getCrawlerConfig().setCrawlerListeners(new ICrawlerEventListener[] {
            new ICrawlerEventListener() {
                @Override
                public void crawlerEvent(ICrawler source, CrawlerEvent event) {
                    if ("REJECTED_NONCANONICAL".equals(event.getEventType())) {
                        rejections.increment();
                    }
                }
            }
        });
        collector.start(false);

        assertListSize("document", getCommitedDocuments(crawler), 1);
        Assert.assertEquals("Wrong number of canonical link rejection.", 2L, rejections.intValue());
    }

    /** The special-URLs test case must commit four documents. */
    @Test
    public void testSpecialURLs() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=specialURLs");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        collector.start(false);

        assertListSize("document", getCommitedDocuments(crawler), 4);
    }

    /**
     * With {@code script/src} registered as a link tag, the script target
     * must be crawled while inline script content must not appear in the
     * content of the first page.
     */
    @Test
    public void testScriptTags() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=script");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];

        // Declared with the concrete type: addLinkTag is invoked on it directly.
        GenericLinkExtractor extractor = new GenericLinkExtractor();
        extractor.addLinkTag("script", "src");
        crawler.getCrawlerConfig().setLinkExtractors(new ILinkExtractor[] {extractor});
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 2);
        for (HttpDocument doc : docs) {
            String content = contentOf(doc);
            if (doc.getReference().contains("script=true")) {
                Assert.assertTrue("Script page not crawled properly",
                        content.contains("This must be crawled"));
            } else {
                Assert.assertTrue("First page not crawled properly",
                        content.contains("View the source"));
                Assert.assertFalse("Did not strip inside of <script>",
                        content.contains("THIS_MUST_BE_STRIPPED"));
            }
        }
    }

    /**
     * With the crawl scope restricted to the same port, the JavaScript-URL
     * test case must commit two documents and only one URL must have been
     * extracted from the first page.
     */
    @Test
    public void testJavaScriptURL() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=jsURL");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];

        URLCrawlScopeStrategy scope = new URLCrawlScopeStrategy();
        scope.setStayOnPort(true);
        crawler.getCrawlerConfig().setUrlCrawlScopeStrategy(scope);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 2);
        for (HttpDocument doc : docs) {
            String content = contentOf(doc);
            if (doc.getReference().contains("goodurl=true")) {
                Assert.assertTrue("Script page not crawled properly",
                        content.contains("Must be crawled (2 of 2)"));
            } else {
                Assert.assertTrue("First page not crawled properly",
                        content.contains("Must be crawled (1 of 2)"));
                Assert.assertEquals("Only 1 URL should have been extracted.", 1L,
                        doc.getMetadata().get("collector.referenced-urls").size());
            }
        }
    }

    /** The zero-length test case must still commit one document. */
    @Test
    public void testZeroLength() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=zeroLength");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        collector.start(false);

        assertListSize("document", getCommitedDocuments(crawler), 1);
    }

    /**
     * With the default fetcher settings, the document must be reported as
     * {@code application/javascript} with {@code Big5} encoding.
     */
    @Test
    public void testContentTypeCharsetDefault() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=contentTypeCharset");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpMetadata metadata = docs.get(0).getMetadata();
        Assert.assertEquals("application/javascript", metadata.getString("document.contentType"));
        Assert.assertEquals("Big5", metadata.getString("document.contentEncoding"));
    }

    /**
     * With content-type and charset detection switched on in the fetcher,
     * the same page must be reported as {@code text/html} in UTF-8.
     */
    @Test
    public void testContentTypeCharsetDetect() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=contentTypeCharset");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];

        // The detection setters are called on GenericDocumentFetcher, hence the downcast.
        GenericDocumentFetcher fetcher =
                (GenericDocumentFetcher) crawler.getCrawlerConfig().getDocumentFetcher();
        fetcher.setDetectContentType(true);
        fetcher.setDetectCharset(true);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpMetadata metadata = docs.get(0).getMetadata();
        Assert.assertEquals("text/html", metadata.getString("document.contentType"));
        Assert.assertEquals(StandardCharsets.UTF_8.toString(),
                metadata.getString("document.contentEncoding"));
    }

    /**
     * Runs one crawl of the canonical/redirect loop from the given start
     * path and asserts that a single document, with the circular-reference
     * content and a canonical reference, is committed.
     *
     * @param startPath path and query string of the start URL
     * @throws IOException if the crawl or reading the content fails
     */
    private void assertCanonicalRedirectLoop(String startPath) throws IOException {
        HttpCollector collector = newHttpCollector1Crawler(startPath);
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpDocument doc = docs.get(0);
        Assert.assertTrue("Wrong content",
                contentOf(doc).contains("Canonical-redirect circular reference"));
        Assert.assertTrue("Wrong reference", doc.getReference().contains("&type=canonical"));
        LOG.info("FINAL REF: " + doc.getReference());
        LOG.info("FINAL TRAIL:" + doc.getMetadata().getStrings("collector.redirect-trail"));
    }

    /**
     * Reads the whole content of a document as a UTF-8 string.
     *
     * @param doc the document to read
     * @return the document content
     * @throws IOException if the content cannot be read
     */
    private String contentOf(HttpDocument doc) throws IOException {
        return IOUtils.toString(doc.getContent(), StandardCharsets.UTF_8);
    }

    /**
     * Asserts the number of documents expected from a crawl of depth 10.
     *
     * @param list the committed documents
     */
    private void testDepth(List<HttpDocument> list) {
        Assert.assertEquals("Did not crawl the right depth.", 11L, list.size());
    }

    /**
     * Asserts that the content-type related metadata of a document is
     * single-valued and holds the expected HTML/UTF-8 values.
     *
     * @param httpDocument the document to check
     */
    private void testValidMetadata(HttpDocument httpDocument) {
        HttpMetadata metadata = httpDocument.getMetadata();
        assertOneValue(metadata, "Content-Type", "collector.content-type", "collector.content-encoding");
        Assert.assertEquals("Bad HTTP content-type",
                "text/html; charset=UTF-8", metadata.getString("Content-Type"));
        Assert.assertEquals("Bad Collection content-type.",
                "text/html", metadata.getString("collector.content-type"));
        Assert.assertEquals("Bad char-encoding.",
                StandardCharsets.UTF_8.toString(), metadata.getString("collector.content-encoding"));
    }

    /**
     * Asserts that a list has the expected size.
     *
     * @param str label of the list, used in the failure message
     * @param list the list to check
     * @param i the expected size
     */
    private void assertListSize(String str, List<?> list, int i) {
        Assert.assertEquals("Wrong " + str + " list size.", i, list.size());
    }

    /**
     * Asserts that each given metadata field holds exactly one value.
     *
     * @param httpMetadata the metadata to check
     * @param strArr the names of the fields to check
     */
    private void assertOneValue(HttpMetadata httpMetadata, String... strArr) {
        for (String str : strArr) {
            Assert.assertEquals(str + " does not contain strickly 1 value.",
                    1L, httpMetadata.getStrings(str).size());
        }
    }
}
