package com.casm.acled.crawler.spring;

import bithazard.sitemap.parser.SitemapParser;
import bithazard.sitemap.parser.model.InvalidSitemapUrlException;
import com.casm.acled.crawler.Crawl;
import com.casm.acled.crawler.management.CheckListService;
import com.casm.acled.crawler.management.ConfigService;
import com.casm.acled.crawler.management.CrawlArgs;
import com.casm.acled.crawler.management.CrawlArgsService;
import com.casm.acled.crawler.reporting.Reporter;
import com.casm.acled.crawler.scraper.ACLEDCommitter;
import com.casm.acled.crawler.util.CustomLoggerRepository;
import com.casm.acled.crawler.util.Util;
import com.casm.acled.dao.entities.ArticleDAO;
import com.casm.acled.dao.entities.SourceDAO;
import com.casm.acled.dao.entities.SourceListDAO;
import com.casm.acled.entities.source.Source;
import com.casm.acled.entities.sourcelist.SourceList;
import com.google.common.collect.ImmutableList;
import com.norconex.collector.http.crawler.HttpCrawlerConfig;
import com.norconex.collector.http.data.HttpCrawlData;
import com.norconex.collector.http.robot.impl.StandardRobotsTxtProvider;
import com.norconex.collector.http.sitemap.ISitemapResolver;
import com.norconex.collector.http.sitemap.SitemapURLAdder;
import com.norconex.collector.http.sitemap.impl.StandardSitemapResolverFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;
import java.time.temporal.TemporalUnit;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import javax.ws.rs.ProcessingException;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.WebTarget;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.log4j.spi.DefaultRepositorySelector;
import org.apache.log4j.spi.RootLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

/**
 * Spring component that starts crawls for a {@link Source} and looks up the
 * sitemap URLs a crawl should be seeded with.
 *
 * <p>Crawls are executed on a dedicated thread (see {@link #run(CrawlArgs)});
 * the remaining public methods are HTTP helpers that probe a site's
 * {@code robots.txt} and sitemap locations.
 */
@Component
public class CrawlService {

    @Autowired
    private SourceListDAO sourceListDAO;

    @Autowired
    private SourceDAO sourceDAO;

    @Autowired
    private ArticleDAO articleDAO;

    @Autowired
    private Reporter reporter;

    @Autowired
    private CrawlArgsService argsService;

    @Autowired
    private CheckListService checkListService;

    @Autowired
    private ConfigService configService;

    protected static final Logger logger = LoggerFactory.getLogger(CrawlService.class);

    /**
     * File names probed under a site's root when no sitemap is advertised.
     * NOTE(review): left non-final to stay source-compatible with any code that
     * reassigns it; make it {@code final} once that is confirmed not to happen.
     */
    public static List<String> STANDARD_SITEMAP_LOCS = ImmutableList.of("sitemap.xml", "sitemap_index.xml");

    /** Key under which {@link #parseRobots(String)} stores {@code Sitemap:} directives. */
    private static final String SITEMAP = "Sitemap";

    /** Jersey client property that makes the client follow HTTP redirects itself. */
    private static final String FOLLOW_REDIRECTS = "jersey.config.client.followRedirects";

    /** Upper bound on redirect hops so a redirect loop cannot recurse forever. */
    private static final int MAX_REDIRECTS = 10;

    /** How far back, in days, {@link #recentSitemapURLs} asks the sitemap resolver to look. */
    private static final long RECENT_DAYS = 3L;

    /** {@link #recentSitemapURLs} stops the resolver once more than this many URLs were collected. */
    private static final int RECENT_URL_LIMIT = 10;

    /**
     * Runs a crawl without a date range.
     *
     * @param sourceListId id of the source list the source belongs to
     * @param sourceId     id of the source to crawl
     * @param skipKeywords value copied to {@code CrawlArgs.skipKeywords}
     */
    public void run(int sourceListId, int sourceId, boolean skipKeywords) {
        run(sourceListId, sourceId, null, null, skipKeywords);
    }

    /**
     * Runs a small crawl (depth 3, at most 10 articles, no sitemaps) synchronously
     * on the calling thread.
     *
     * @param sourceListId id of the source list
     * @param sourceId     id of the source
     * @throws RuntimeException if either entity cannot be found
     */
    public void collectExamples(int sourceListId, int sourceId) {
        Optional<SourceList> sourceList = this.sourceListDAO.getById(sourceListId);
        Optional<Source> source = this.sourceDAO.getById(sourceId);
        logger.info("collecting examples for source list {} and source {}", sourceListId, sourceId);
        if (!sourceList.isPresent() || !source.isPresent()) {
            throw new RuntimeException("source or source list not found!");
        }
        ACLEDCommitter committer =
                new ACLEDCommitter(this.articleDAO, source.get(), this.sourceListDAO, true, true);
        committer.setMaxArticles(10);
        CrawlArgs args = this.argsService.get();
        args.source = source.get();
        args.sourceLists = ImmutableList.of(sourceList.get());
        args.depth = 3;
        new Crawl(args, committer, this.reporter, ImmutableList.of()).run();
    }

    /**
     * Looks up the source and source list, fills in a {@link CrawlArgs} and runs the crawl.
     *
     * @param sourceListId id of the source list
     * @param sourceId     id of the source
     * @param from         value copied to {@code CrawlArgs.from}; may be {@code null}
     * @param to           value copied to {@code CrawlArgs.to}; may be {@code null}
     * @param skipKeywords value copied to {@code CrawlArgs.skipKeywords}
     * @throws NoSuchElementException if the source or the source list does not exist
     */
    public void run(int sourceListId, int sourceId, LocalDate from, LocalDate to, boolean skipKeywords) {
        // Same exception type Optional.get() used to raise, but saying which id was missing.
        SourceList sourceList = this.sourceListDAO.getById(sourceListId)
                .orElseThrow(() -> new NoSuchElementException("source list not found: " + sourceListId));
        Source source = this.sourceDAO.getById(sourceId)
                .orElseThrow(() -> new NoSuchElementException("source not found: " + sourceId));
        CrawlArgs args = this.argsService.get();
        args.source = source;
        args.sourceLists = ImmutableList.of(sourceList);
        args.from = from;
        args.to = to;
        args.skipKeywords = Boolean.valueOf(skipKeywords);
        run(args);
    }

    /**
     * Runs the crawl described by {@code crawlArgs} on a new thread placed in a
     * thread group named after the source id, and blocks until it finishes.
     *
     * @param crawlArgs crawl configuration; {@code crawlArgs.source} must be set
     * @throws RuntimeException wrapping whatever escaped the crawl thread, or the
     *                          {@link InterruptedException} if the wait was interrupted
     */
    public void run(CrawlArgs crawlArgs) {
        Source source = crawlArgs.source;
        ThreadGroup group = new ThreadGroup(Integer.toString(source.id()));
        Thread worker = new Thread(group, () -> {
            configureLogging(crawlArgs.workingDir, Crawl.id(crawlArgs.source));
            ACLEDCommitter committer =
                    new ACLEDCommitter(this.articleDAO, source, this.sourceListDAO, true, true);
            new Crawl(crawlArgs, committer, this.reporter, getSitemaps(source)).run();
        });
        AtomicReference<Throwable> failure = new AtomicReference<>();
        worker.setUncaughtExceptionHandler((thread, error) -> failure.set(error));
        worker.start();
        try {
            worker.join();
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        Throwable error = failure.get();
        if (error != null) {
            throw new RuntimeException(error);
        }
    }

    /**
     * Installs the custom log4j repository and registers the current thread
     * group's name against {@code crawlId}.
     */
    private void configureLogging(Path workingDir, String crawlId) {
        try {
            LogManager.setRepositorySelector(
                    new DefaultRepositorySelector(
                            new CustomLoggerRepository(new RootLogger(Level.DEBUG), workingDir)),
                    new Object());
        } catch (IllegalArgumentException ignored) {
            // Deliberately best-effort. NOTE(review): presumably log4j rejects a second
            // selector installed with a different guard object, in which case the one
            // installed by an earlier crawl stays in place - confirm.
            logger.debug("log4j repository selector not replaced: {}", ignored.getMessage());
        }
        CustomLoggerRepository.register(Thread.currentThread().getThreadGroup().getName(), crawlId);
    }

    /**
     * Resolves the given sitemaps and collects URLs found in them, asking the
     * resolver for entries from the last {@value #RECENT_DAYS} days and stopping
     * it once more than {@value #RECENT_URL_LIMIT} URLs have been gathered.
     *
     * @param url  site URL handed to the resolver and used to derive the crawler id
     * @param list sitemap locations to resolve
     * @return the distinct, non-empty references reported by the resolver
     */
    public Set<String> recentSitemapURLs(String url, List<String> list) {
        StandardSitemapResolverFactory factory = new StandardSitemapResolverFactory();
        factory.setLenient(true);
        // Take "now" in UTC: a system-zone wall-clock time converted with a UTC offset
        // would shift the cutoff by the local zone offset.
        long cutoffMillis = LocalDateTime.now(ZoneOffset.UTC)
                .minusDays(RECENT_DAYS)
                .toInstant(ZoneOffset.UTC)
                .toEpochMilli();
        factory.setFrom(cutoffMillis);

        HttpCrawlerConfig crawlerConfig = new HttpCrawlerConfig();
        crawlerConfig.setId(Util.getID(url));
        crawlerConfig.setWorkDir(Paths.get("sitemap-check").toFile());

        final ISitemapResolver resolver = factory.createSitemapResolver(crawlerConfig, false);
        final Set<String> urls = new HashSet<>();
        CloseableHttpClient httpClient = HttpClientBuilder.create().build();
        try {
            resolver.resolveSitemaps(httpClient, url, list.toArray(new String[0]), new SitemapURLAdder() {
                @Override
                public void add(HttpCrawlData crawlData) {
                    if (!crawlData.getReference().isEmpty()) {
                        urls.add(crawlData.getReference());
                    }
                    if (urls.size() > RECENT_URL_LIMIT) {
                        resolver.stop();
                    }
                }
            }, true);
        } finally {
            resolver.stop();
            closeQuietly(httpClient);
        }
        return urls;
    }

    /**
     * Downloads {@code <url>/robots.txt} and parses it with {@link #parseRobots(String)}.
     *
     * @param url site root
     * @return the parsed directives, or an empty map when the request fails
     */
    public Map<String, String> getRobots(String url) {
        Client client = ClientBuilder.newClient();
        try {
            WebTarget target = client.target(url).path("robots.txt");
            target.property(FOLLOW_REDIRECTS, Boolean.TRUE);
            logger.info(url + "/robots.txt");
            return parseRobots(target.request("text/plain").get(String.class));
        } catch (ProcessingException | WebApplicationException e) {
            logger.warn(e.getMessage());
            return new HashMap<>();
        } finally {
            client.close();
        }
    }

    /**
     * Splits robots.txt content into a directive-name to value map.
     *
     * <p>Each non-blank line is split at its first {@code ':'}; the trimmed text
     * before it is the key and the trimmed text after it the value. Lines without
     * a usable name are stored under {@code "_default"}. Values of repeated keys
     * are joined with {@code ","} in order of appearance.
     *
     * @param str robots.txt content; {@code null} yields an empty map
     * @return mutable map of directives
     */
    public Map<String, String> parseRobots(String str) {
        Map<String, String> directives = new HashMap<>();
        if (str == null) {
            return directives;
        }
        for (String line : str.split("\n")) {
            if (line.trim().isEmpty()) {
                // Blank separator lines carry no directive.
                continue;
            }
            int colon = line.indexOf(':');
            String name = colon > 0 ? line.substring(0, colon).trim() : "";
            String key = name.isEmpty() ? "_default" : name;
            String value = line.substring(colon + 1).trim();
            String previous = directives.get(key);
            directives.put(key, previous == null ? value : previous + "," + value);
        }
        return directives;
    }

    /**
     * Looks up the sitemaps of every source in a source list.
     *
     * @param sourceList list whose sources are inspected
     * @return map from each source's {@code STANDARD_NAME} to its reachable sitemap URLs
     */
    public Map<String, List<String>> getSitemaps(SourceList sourceList) {
        Map<String, List<String>> sitemapsByName = new HashMap<>();
        for (Source source : this.sourceDAO.byList(sourceList)) {
            sitemapsByName.put((String) source.get("STANDARD_NAME"), getSitemaps(source));
        }
        return sitemapsByName;
    }

    /**
     * Requests each of {@link #STANDARD_SITEMAP_LOCS} below {@code baseUrl} and
     * returns the URLs that could be fetched.
     */
    private List<String> checkStandardLocs(String baseUrl) {
        List<String> found = new ArrayList<>();
        Client client = ClientBuilder.newClient();
        try {
            for (String location : STANDARD_SITEMAP_LOCS) {
                WebTarget target = client.target(baseUrl).path(location);
                target.property(FOLLOW_REDIRECTS, Boolean.TRUE);
                String candidate = joinUrl(baseUrl, location);
                try {
                    target.request("application/xml").get(String.class);
                    logger.info(candidate);
                    found.add(candidate);
                } catch (ProcessingException | WebApplicationException e) {
                    logger.warn(candidate + " : " + e.getMessage());
                }
            }
        } finally {
            client.close();
        }
        return found;
    }

    /**
     * Returns the sitemap locations reported by Norconex's
     * {@link StandardRobotsTxtProvider} for the source's {@code LINK}.
     *
     * @param source source whose {@code LINK} is inspected
     * @return fixed-size list view of the reported locations
     */
    public List<String> getSitemaps3(Source source) {
        CloseableHttpClient httpClient = HttpClientBuilder.create().build();
        try {
            return Arrays.asList(new StandardRobotsTxtProvider()
                    .getRobotsTxt(httpClient, (String) source.get("LINK"), this.configService.userAgent())
                    .getSitemapLocations());
        } finally {
            closeQuietly(httpClient);
        }
    }

    /**
     * Finds sitemap URLs for a source: the locations reported by
     * {@link SitemapParser} for the (redirect-resolved) {@code LINK}, or the
     * {@link #STANDARD_SITEMAP_LOCS} below it when none are reported. Only URLs
     * that pass {@link #checkURLs(List)} are returned.
     *
     * @param source source whose {@code LINK} is inspected
     * @return reachable sitemap URLs; empty when the source has no {@code LINK}
     */
    public List<String> getSitemaps(Source source) {
        String link = (String) source.get("LINK");
        if (link == null || link.isEmpty()) {
            // Same guard as getSitemaps2: without a URL there is nothing to probe.
            logger.warn("empty URL {}", (String) source.get("STANDARD_NAME"));
            return new ArrayList<>();
        }
        String baseUrl = followRedirects(Util.ensureHTTP(link, false));
        Set<String> candidates = new HashSet<>();
        SitemapParser parser = new SitemapParser();
        parser.setUserAgent(this.configService.userAgent());
        try {
            candidates.addAll(parser.getSitemapLocations(baseUrl));
        } catch (InvalidSitemapUrlException e) {
            logger.warn(e.getMessage(), e);
        }
        if (candidates.isEmpty()) {
            for (String location : STANDARD_SITEMAP_LOCS) {
                candidates.add(joinUrl(baseUrl, location));
            }
        }
        return checkURLs(new ArrayList<>(candidates));
    }

    /**
     * Filters a list of URLs down to those that {@link #checkURL(String, Client)} accepts.
     *
     * @param list URLs to test
     * @return new list with the accepted URLs, in input order
     */
    public List<String> checkURLs(List<String> list) {
        Client client = ClientBuilder.newClient();
        try {
            List<String> reachable = new ArrayList<>();
            for (String url : list) {
                if (checkURL(url, client)) {
                    reachable.add(url);
                }
            }
            return reachable;
        } finally {
            client.close();
        }
    }

    /**
     * Issues a GET for {@code str} using the supplied client.
     *
     * @param str    URL to request
     * @param client client to use; it is not closed by this method
     * @return {@code true} if the request completed without an exception;
     *         {@code false} on an HTTP error status, a connection-level failure
     *         or a URL the client rejects as invalid
     */
    public boolean checkURL(String str, Client client) {
        try {
            client.target(str).request().get(String.class);
            return true;
        } catch (ProcessingException | WebApplicationException | IllegalArgumentException e) {
            // One unreachable or malformed candidate must not abort the whole lookup.
            logger.debug("{} : {}", str, e.getMessage());
            return false;
        }
    }

    /**
     * Same as {@link #checkURL(String, Client)} with a short-lived client.
     *
     * @param str URL to request
     * @return whether the URL could be fetched
     */
    public boolean checkURL(String str) {
        Client client = ClientBuilder.newClient();
        try {
            return checkURL(str, client);
        } finally {
            client.close();
        }
    }

    /**
     * Alternative sitemap lookup: probes {@link #STANDARD_SITEMAP_LOCS} first and,
     * if none can be fetched, falls back to the {@code Sitemap} entries of
     * {@code robots.txt}.
     *
     * @param source source whose {@code LINK} is inspected
     * @return sitemap URLs; empty when the source has no {@code LINK}
     */
    public List<String> getSitemaps2(Source source) {
        String link = (String) source.get("LINK");
        if (link == null || link.isEmpty()) {
            logger.warn("empty URL {}", (String) source.get("STANDARD_NAME"));
            return new ArrayList<>();
        }
        String baseUrl = followRedirects(Util.ensureHTTP(link, false));
        List<String> sitemaps = checkStandardLocs(baseUrl);
        if (sitemaps.isEmpty()) {
            String declared = getRobots(baseUrl).get(SITEMAP);
            if (declared != null) {
                for (String location : declared.split(",")) {
                    // An empty "Sitemap:" directive would otherwise contribute "".
                    if (!location.trim().isEmpty()) {
                        sitemaps.add(location);
                    }
                }
            }
        }
        return sitemaps;
    }

    /**
     * Requests {@code str} and, while the response is a 3xx error carrying a
     * {@code Location} header, repeats the request against that location.
     *
     * @param str URL to start from
     * @return the last URL requested: the one that answered successfully, the one
     *         that failed with a non-redirect error, or the one reached when
     *         {@value #MAX_REDIRECTS} hops were exhausted
     */
    public String followRedirects(String str) {
        Client client = ClientBuilder.newClient();
        try {
            String current = str;
            for (int hop = 0; hop < MAX_REDIRECTS; hop++) {
                try {
                    client.target(current).request().get(String.class);
                    return current;
                } catch (WebApplicationException e) {
                    int status = e.getResponse().getStatus();
                    Object location = e.getResponse().getHeaders().getFirst("Location");
                    boolean redirect = status >= 300 && status < 400;
                    if (!redirect || location == null) {
                        // Either a real error or a redirect that does not say where to go.
                        logger.warn(current + " : " + e.getMessage());
                        return current;
                    }
                    current = location.toString();
                } catch (ProcessingException e) {
                    logger.warn(current + " : " + e.getMessage());
                    return current;
                }
            }
            logger.warn("{} : gave up after {} redirects", str, MAX_REDIRECTS);
            return current;
        } finally {
            client.close();
        }
    }

    /** Joins a base URL and a relative path with exactly one separating slash added if needed. */
    private static String joinUrl(String base, String path) {
        return base + (base.endsWith("/") ? "" : "/") + path;
    }

    /** Closes an Apache HTTP client, logging instead of propagating a failure to close. */
    private static void closeQuietly(CloseableHttpClient httpClient) {
        try {
            httpClient.close();
        } catch (IOException e) {
            logger.warn("failed to close HTTP client", e);
        }
    }
}
