package hwu.elixir.scrape.scraper.examples;

import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import hwu.elixir.utils.Helpers;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import org.apache.http.HttpHost;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:hwu/elixir/scrape/scraper/examples/FileScraper.class */
/**
 * Scrapes every URL listed in the file named by the {@code locationOfSitesFile} property.
 *
 * <p>Each line of that file is either a plain URL or {@code URL,flag} where the flag is
 * {@code static} or {@code dynamic}. A URL whose text contains "sitemap" is fetched and every
 * {@code loc} entry inside it is scraped in turn; any other URL is scraped directly. URLs that
 * fail are appended to an "unscraped" file in the output folder.
 */
public class FileScraper extends ScraperFilteredCore {
    /** Lines read from the sites file; static, so shared by every instance in the JVM. */
    private static final ArrayList<String> urlsToScrape = new ArrayList<>();
    /** Only used from {@link #main(String[])}; SimpleDateFormat is not safe for concurrent use. */
    private static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
    private static final Logger logger = LoggerFactory.getLogger(FileScraper.class.getName());

    /** Result of one call to {@code scrape}. */
    private enum ScrapeOutcome {
        /** {@code scrape} returned {@code true}. */
        SCRAPED,
        /** {@code scrape} returned {@code false} without throwing. */
        NOT_SCRAPED,
        /** {@code scrape} threw; the URL has been recorded in the unscraped file. */
        FAILED
    }

    /** Shuts the scraper down and terminates the JVM with exit status -1. */
    private void abort() {
        shutdown();
        System.exit(-1);
    }

    /**
     * Loads the sites file into {@link #urlsToScrape}.
     *
     * <p>Lines are trimmed and kept only when they start with "http". The process is terminated
     * when the property is unset/empty, the file does not exist, or the file cannot be read.
     */
    private void readFileList() {
        String sitesFileLocation = this.properties.getLocationOfSitesFile();
        // Test for null first: calling equals() on a null location would throw before the
        // intended error message could be logged.
        if (sitesFileLocation == null || sitesFileLocation.equals("")) {
            logger.error("Please set property *locationOfSitesFile*");
            abort();
        }
        File sitesFile = new File(sitesFileLocation);
        if (!sitesFile.exists()) {
            logger.error("Cannot find file *{}*. Please set correct value for locationOfSitesFile", sitesFileLocation);
            abort();
        }
        try (BufferedReader reader = new BufferedReader(new FileReader(sitesFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String candidate = line.trim();
                if (candidate.startsWith(HttpHost.DEFAULT_SCHEME_NAME)) {
                    urlsToScrape.add(candidate);
                }
            }
        } catch (IOException e) {
            logger.error("Problem reading sites from file *" + sitesFileLocation + "*.", e);
            abort();
        }
        logger.info("Read " + urlsToScrape.size() + " urls to scrape from " + sitesFileLocation + ".\n");
    }

    /**
     * Downloads a sitemap and returns the elements matching {@code sitemapURLKey}.
     *
     * <p>A URL ending in ".gz" (any case) is downloaded as bytes and handed to
     * {@code Helpers.gzipFileDecompression}; anything else is fetched directly with no body size
     * limit. A warning is logged when the document contains the text "sitemapindex".
     *
     * @param url           address of the sitemap
     * @param sitemapURLKey selector used to pick the entries out of the document
     * @return the matching elements; empty when the sitemap could not be fetched or parsed
     */
    private Elements getSitemapList(String url, String sitemapURLKey) {
        logger.info("parse sitemap list");
        Document sitemap;
        try {
            // regionMatches returns false (rather than throwing) when the URL is shorter than the suffix.
            if (url.regionMatches(true, url.length() - 3, ".gz", 0, 3)) {
                logger.info("compressed sitemap");
                byte[] compressed = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
                sitemap = Helpers.gzipFileDecompression(compressed);
            } else {
                sitemap = Jsoup.connect(url).maxBodySize(0).get();
            }
        } catch (IOException e) {
            logger.error("Jsoup parsing exception: {}", e.getMessage(), e);
            return new Elements();
        }
        if (sitemap == null) {
            // NOTE(review): presumably only the decompression helper can yield null here - confirm.
            logger.error("No document could be obtained from sitemap {}", url);
            return new Elements();
        }
        if (sitemap.outerHtml().contains("sitemapindex")) {
            logger.warn("please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
        }
        return sitemap.select(sitemapURLKey);
    }

    /**
     * Extracts the URL part of a {@code URL,flag} line.
     *
     * @param line a line from the sites file
     * @return the trimmed text before the first comma, or the whole trimmed line if there is none
     */
    private String getURLFromTextLine(String line) {
        int comma = line.indexOf(',');
        String url = comma < 0 ? line : line.substring(0, comma);
        return url.trim();
    }

    /**
     * Extracts the scraping mode from a {@code URL,flag} line.
     *
     * @param line a line from the sites file
     * @return "static" or "dynamic" when the text after the first comma matches (ignoring case
     *         and surrounding whitespace), otherwise "unknown"
     */
    private String getDynamicStaticFlag(String line) {
        // Start one past the comma: including the comma itself would make every flag "unknown".
        String flag = line.substring(line.indexOf(',') + 1).trim();
        if (flag.equalsIgnoreCase("static")) {
            return "static";
        }
        if (flag.equalsIgnoreCase("dynamic")) {
            return "dynamic";
        }
        return "unknown";
    }

    /**
     * Appends a URL that could not be scraped to a file in the given folder.
     *
     * @param folder         directory to write into; created if missing
     * @param fileName       name suffix for the file, or {@code null} to use {@code contextCounter}
     * @param url            the URL to record
     * @param contextCounter counter used in the file name when {@code fileName} is {@code null}
     */
    private void unscrapedURLsToFile(String folder, String fileName, String url, long contextCounter) {
        File directory = new File(folder);
        if (!directory.exists() && !directory.mkdirs()) {
            logger.warn("Could not create directory {}", folder);
        }
        String suffix = fileName == null ? String.valueOf(contextCounter) : fileName;
        String path = folder + "/unscraped_" + suffix + ".txt";
        try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(path, true)))) {
            out.println(url);
        } catch (IOException e) {
            logger.error("Problem writing to unscraped file for " + url, e);
        }
    }

    /**
     * Scrapes one URL, logging any failure and recording the URL in the unscraped file.
     *
     * @param url          page to scrape
     * @param outputFolder folder used in messages and for the unscraped file
     * @param id           context counter value assigned to this URL
     * @param dynamic      mode flag forwarded to {@code scrape}
     * @return how the attempt ended
     */
    private ScrapeOutcome attemptScrape(String url, String outputFolder, long id, boolean dynamic) {
        try {
            // NOTE(review): the decompiled call sites passed `this` / `null` as the second
            // argument right after a discarded getOutputFolder() call; the output folder is
            // assumed to be the intended value - confirm against ScraperFilteredCore.scrape.
            return scrape(url, outputFolder, null, id, dynamic) ? ScrapeOutcome.SCRAPED : ScrapeOutcome.NOT_SCRAPED;
        } catch (CannotWriteException e) {
            logger.error("Problem writing file for " + url + " to the " + outputFolder + " directory.");
        } catch (FourZeroFourException e) {
            logger.error(url + " returned a 404.");
        } catch (JsonLDInspectionException e) {
            logger.error("The JSON-LD could not be parsed for " + url);
        } catch (MissingMarkupException e) {
            logger.error("Problem obtaining markup from " + url + ".");
        }
        unscrapedURLsToFile(outputFolder, null, url, id);
        return ScrapeOutcome.FAILED;
    }

    /**
     * Scrapes the pages listed under {@code loc} in a sitemap.
     *
     * @param sitemapUrl     address of the sitemap
     * @param outputFolder   folder passed on for each page
     * @param contextCounter first counter value to hand out
     * @param dynamic        mode flag forwarded for each page
     * @return the next unused counter value
     */
    private long scrapeSitemap(String sitemapUrl, String outputFolder, long contextCounter, boolean dynamic) {
        int maxLimitScrape = this.properties.getMaxLimitScrape();
        Elements locations = getSitemapList(sitemapUrl, "loc");
        logger.info("Sitemap found in URL: " + sitemapUrl);

        long nextId = contextCounter;
        int attempted = 0;
        // A plain for-each ends when the entries run out; the limit check is the only early exit.
        for (Element location : locations) {
            String pageUrl = location.text();
            logger.info("Attempting to scrape: " + pageUrl);
            long id = nextId++;
            ScrapeOutcome outcome = attemptScrape(pageUrl, outputFolder, id, dynamic);
            if (outcome == ScrapeOutcome.FAILED) {
                logger.error("URL " + pageUrl + " NOT SCRAPED, added to unscraped list");
            } else {
                displayResult(pageUrl, outcome == ScrapeOutcome.SCRAPED, outputFolder, id);
            }
            attempted++;
            // NOTE(review): this comparison lets maxLimitScrape + 1 pages through before
            // stopping; kept as found - confirm whether ">=" was intended.
            if (maxLimitScrape < attempted) {
                logger.info("MAX SITEMAP LIMIT REACHED: " + maxLimitScrape);
                logger.info("Scraping over");
                break;
            }
        }
        return nextId;
    }

    /**
     * Reads the sites file and scrapes every entry, then stores the advanced context counter,
     * updates the configuration and shuts the scraper down.
     */
    public void scrapeAllUrls() {
        readFileList();
        long contextCounter = this.properties.getContextCounter();
        String outputFolder = this.properties.getOutputFolder();
        boolean defaultDynamic = this.properties.dynamic();

        for (String line : urlsToScrape) {
            String url = line;
            // Start from the configured default for every line so that a flag on one line
            // does not carry over to the lines after it.
            boolean dynamic = defaultDynamic;
            if (line.indexOf(',') != -1) {
                // "dynamic" and "unknown" both select dynamic scraping; only "static" turns it off.
                dynamic = !"static".equals(getDynamicStaticFlag(line));
                url = getURLFromTextLine(line);
            }
            logger.info("Attempting to scrape: " + url);
            if (url.toLowerCase().contains("sitemap")) {
                contextCounter = scrapeSitemap(url, outputFolder, contextCounter, dynamic);
            } else {
                long id = contextCounter++;
                ScrapeOutcome outcome = attemptScrape(url, outputFolder, id, dynamic);
                displayResult(url, outcome == ScrapeOutcome.SCRAPED, outputFolder, id);
            }
        }
        logger.info("Scraping over.");
        this.properties.setContextCounter(contextCounter);
        this.properties.updateConfig();
        shutdown();
    }

    /**
     * Command-line entry point: logs start and end timestamps around a full scrape run and
     * exits with status 0.
     *
     * @param strArr ignored
     * @throws FourZeroFourException     declared for compatibility with existing callers
     * @throws JsonLDInspectionException declared for compatibility with existing callers
     */
    public static void main(String[] strArr) throws FourZeroFourException, JsonLDInspectionException {
        logger.info("*************************** STARTING SCRAPE: " + formatter.format(new Date()));
        logger.info("Default charset: " + Charset.defaultCharset());
        new FileScraper().scrapeAllUrls();
        logger.info("*************************** ENDING SCRAPE: " + formatter.format(new Date()));
        System.exit(0);
    }
}
