package eu.dnetlib.data.utility.resource_discovery.crawler;

import eu.dnetlib.data.utility.resource_discovery.crawler.config.Configs;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;
import java.io.IOException;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Vector;
import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/* loaded from: input_file:WEB-INF/lib/dnet-resource-discovery-2.0.0.jar:eu/dnetlib/data/utility/resource_discovery/crawler/Crawler.class */
/**
 * Thin wrapper around a {@link WebRobot} that runs a crawl from a start URL and
 * returns the URLs reported by a {@link URLLogger} document manager.
 *
 * <p>The wrapped robot is configured once at construction time through
 * {@link Configs}; each call to {@link #getLinks(String)} re-targets the same
 * robot instance.
 */
public class Crawler {
    private static final Log logger = LogFactory.getLog(Crawler.class);

    /** The underlying robot; configured in the constructor and reused across calls. */
    private final WebRobot crawler = new WebRobot();

    /**
     * Creates a crawler using the standard configuration
     * ({@link Configs#configureCrawler}).
     *
     * @throws IOException   propagated from the configuration step
     * @throws HttpException propagated from the configuration step
     */
    public Crawler() throws IOException, HttpException {
        this(false);
    }

    /**
     * Creates a crawler with either the validation or the standard configuration.
     *
     * @param z {@code true} to apply {@link Configs#configureCrawlerForValidation},
     *          {@code false} to apply {@link Configs#configureCrawler}
     * @throws IOException   propagated from the configuration step
     * @throws HttpException propagated from the configuration step
     */
    public Crawler(boolean z) throws IOException, HttpException {
        if (z) {
            Configs.configureCrawlerForValidation(this.crawler);
        } else {
            Configs.configureCrawler(this.crawler);
        }
    }

    /** Sets the robot's maximum crawl depth to 2 before a retry. */
    public void reconfigureForRetry() {
        this.crawler.setMaxDepth(2);
    }

    /**
     * Crawls starting from the given URL and returns the entries written by the
     * {@link URLLogger} during the run.
     *
     * <p>The URL is first passed through {@link UrlFilter#resolveRedirections} and
     * the result is used as the robot's start URL. The logger output is split on
     * newlines; empty entries are dropped, so a run that logs nothing yields an
     * empty vector rather than a vector containing a single empty string.
     *
     * @param str the URL to start crawling from
     * @return the non-empty lines logged during the crawl, in logged order; never {@code null}
     * @throws MalformedURLException if the resolved start URL cannot be parsed
     * @throws IOException           declared by the original contract (presumably raised
     *                               while resolving redirections)
     * @throws InterruptedException  declared by the original contract (presumably raised
     *                               while resolving redirections)
     */
    public Vector<String> getLinks(String str) throws MalformedURLException, IOException, InterruptedException {
        logger.debug("Retrieving links from url " + str);
        this.crawler.setStartURL(new URL(UrlFilter.resolveRedirections(str)));

        // Capture everything the URL logger writes during this run in memory.
        StringWriter loggedUrls = new StringWriter();
        this.crawler.setDocManager(new URLLogger(loggedUrls));
        this.crawler.run();

        Vector<String> links = new Vector<>();
        for (String line : loggedUrls.toString().split("\n")) {
            // Defensive: drop a trailing CR in case the logger emitted platform
            // line separators ("\r\n") rather than a bare "\n".
            if (line.endsWith("\r")) {
                line = line.substring(0, line.length() - 1);
            }
            // "".split("\n") returns [""], so without this check an empty crawl
            // would be reported as one (blank) link.
            if (!line.isEmpty()) {
                links.add(line);
            }
        }
        return links;
    }

    /**
     * Manual smoke test: configures a crawler, prints two of the robot's host/domain
     * settings, then crawls a fixed demo URL and prints the links found.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        final String demoUrl = "http://www.di.uoa.gr/gr";
        try {
            Crawler crawler = new Crawler();
            System.out.println(crawler.crawler.getAllowWholeHost() + " " + crawler.crawler.getAllowWholeDomain());
            try {
                System.out.println(crawler.getLinks(demoUrl));
            } catch (Exception e) {
                System.err.println("ERROR: Crawler could not retrieve links from url " + demoUrl);
                System.err.println(e.getLocalizedMessage());
                e.printStackTrace();
            }
        } catch (Exception e2) {
            System.err.println("FATAL ERROR: Crawler could not be configured. Please check your robot.xml parameters and try again.");
            System.err.println(e2.getLocalizedMessage());
            e2.printStackTrace();
        }
    }

    /**
     * Returns the wrapped robot instance itself (not a copy), so callers can
     * adjust its settings directly.
     *
     * @return the underlying {@link WebRobot}
     */
    public WebRobot getCrawler() {
        return this.crawler;
    }
}
