package eu.openaire.publications_retriever;

import eu.openaire.publications_retriever.crawler.MetadataHandler;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.DomainConnectionData;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.signal.SignalUtils;
import eu.openaire.publications_retriever.util.url.GenericUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/PublicationsRetriever.class */
/**
 * Command-line entry point of the PublicationsRetriever.
 *
 * <p>{@link #main(String[])} parses the arguments, prepares the input stream, creates the
 * worker-thread pool, runs the {@link LoaderAndChecker}, shuts the pool down and finally logs
 * the run statistics through {@link #showStatistics(Instant)}.
 *
 * <p>Exit codes used by this class: {@code -44} when the internet-connectivity check fails,
 * {@code -4} when the {@link LoaderAndChecker} throws a {@link RuntimeException} and {@code -5}
 * when the statistics find that no urls were loaded from the input.
 */
public class PublicationsRetriever {
    private static final Logger logger = LoggerFactory.getLogger(PublicationsRetriever.class);

    /** Size (5 MiB) of the buffer wrapped around {@code System.in} when the input is piped. */
    private static final int STDIN_BUFFER_SIZE = 5 * 1024 * 1024;

    /** Moment at which {@link #main(String[])} started its work; {@code null} before that. */
    public static Instant startTime = null;

    /** Two-decimals formatter used for every percentage which appears in the statistics. */
    public static final DecimalFormat df = new DecimalFormat("0.00");

    /** Pool of worker-threads; it is created inside {@link #main(String[])}. */
    public static ExecutorService executor;

    /**
     * Runs the whole retrieval process.
     *
     * @param strArr the command-line arguments, which are handed over to {@code ArgsUtils.parseArgs}
     */
    public static void main(String[] strArr) {
        logger.info("Calling main method with these args: ");
        for (String arg : strArr) {
            logger.info("'" + arg + "'");
        }
        SignalUtils.setSignalHandlers();
        startTime = Instant.now();
        ArgsUtils.parseArgs(strArr);
        if (!GenericUtils.checkInternetConnectivity()) {
            FileUtils.closeIO();
            System.exit(-44);
        }
        logger.info("Starting PublicationsRetriever..");
        ConnSupportUtils.setKnownMimeTypes();
        UrlTypeChecker.setURLDirectoryFilterRegex();

        if (ArgsUtils.inputStream != null) {
            countInputLines();
        } else if (ArgsUtils.inputFromUrl) {
            ArgsUtils.inputStream = ConnSupportUtils.getInputStreamFromInputDataUrl();
        } else {
            ArgsUtils.inputStream = new BufferedInputStream(System.in, STDIN_BUFFER_SIZE);
        }
        new FileUtils(ArgsUtils.inputStream, System.out);

        if (ArgsUtils.workerThreadsCount == 0) {
            // No explicit thread-count was configured, so derive one from the available cores.
            ArgsUtils.workerThreadsCount = Runtime.getRuntime().availableProcessors() * ArgsUtils.threadsMultiplier;
        }
        logger.info("Use " + ArgsUtils.workerThreadsCount + " worker-threads.");
        executor = Executors.newFixedThreadPool(ArgsUtils.workerThreadsCount);

        try {
            new LoaderAndChecker();
        } catch (RuntimeException e) {
            final String errorMsg = "There was a serious error! Output data is affected! Exiting..";
            System.err.println(errorMsg);
            logger.error(errorMsg, e); // Keep the cause in the logs, otherwise the failure cannot be diagnosed.
            FileUtils.closeIO();
            executor.shutdownNow();
            System.exit(-4);
        }

        final boolean wasInterrupted = shutdownExecutor();
        showStatistics(startTime);
        FileUtils.closeIO();
        if (wasInterrupted) {
            // Restore the interrupt-status only after the final I/O, so the latter is not disturbed by it.
            Thread.currentThread().interrupt();
        }
    }

    /** Counts the lines of the input file into {@code FileUtils.numOfLines}; a failure is only logged. */
    private static void countInputLines() {
        try (Stream<String> lines = Files.lines(Paths.get(ArgsUtils.inputFileFullPath))) {
            FileUtils.numOfLines = lines.count();
            logger.info("The numOfLines in the inputFile is " + FileUtils.numOfLines);
        } catch (IOException | UncheckedIOException e) {
            // "Files.lines()" reports the read/decoding failures, which occur while streaming, as "UncheckedIOException".
            logger.error("Problem when retrieving the input-\"numOfLines\"!", e);
        }
    }

    /**
     * Shuts the worker-pool down, giving the running tasks one minute to finish.
     *
     * @return {@code true} when the calling thread was interrupted while waiting
     */
    private static boolean shutdownExecutor() {
        logger.info("Shutting down the threads..");
        executor.shutdown();
        try {
            if (!executor.awaitTermination(1L, TimeUnit.MINUTES)) {
                logger.warn("The working threads did not finish on time! Stopping them immediately..");
                executor.shutdownNow();
            }
        } catch (InterruptedException ie) {
            try {
                executor.shutdownNow();
            } catch (SecurityException se) {
                logger.error("Could not shutdown the threads in any way..!", se);
            }
            return true;
        } catch (SecurityException se) {
            logger.error("Could not shutdown the threads in any way..!", se);
        }
        return false;
    }

    /**
     * Logs the statistics of the run: totals on INFO level, per-category details on DEBUG level.
     * Exits with code {@code -5} when, without id-url pairs, no urls are reported as loaded.
     *
     * @param instant the moment the program started, used for the elapsed-time message
     */
    public static void showStatistics(Instant instant) {
        final int currentlyLoadedUrls = FileUtils.getCurrentlyLoadedUrls();
        long skippedIds = 0;
        final long checkedUrls;
        if (LoaderAndChecker.useIdUrlPairs) {
            logger.debug(LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl.get() + " IDs (about "
                    + percentOf(LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl.get(), LoaderAndChecker.numOfIDs)
                    + "%) had no acceptable sourceUrl.");
            skippedIds = LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl.get() + FileUtils.duplicateIdUrlEntries;
            checkedUrls = LoaderAndChecker.numOfIDs - skippedIds;
        } else {
            checkedUrls = currentlyLoadedUrls;
            final boolean nothingLoaded = FileUtils.skipFirstRow ? (checkedUrls < 0) : (checkedUrls == 0);
            if (nothingLoaded) {
                final String errorMsg = "\"FileUtils.getCurrentlyLoadedUrls()\" is unexpectedly reporting that no urls were retrieved from input file! Output data may be affected! Exiting..";
                System.err.println(errorMsg);
                logger.error(errorMsg);
                FileUtils.closeIO();
                executor.shutdownNow();
                System.exit(-5);
            }
        }

        if (LoaderAndChecker.useIdUrlPairs && checkedUrls < currentlyLoadedUrls) {
            logger.info("Total num of urls (IDs) checked (& connected) from the input was: " + checkedUrls
                    + ". The rest " + skippedIds + " urls (about " + percentOf(skippedIds, LoaderAndChecker.numOfIDs)
                    + "%) belonged to duplicate (" + FileUtils.duplicateIdUrlEntries + ") and/or problematic ("
                    + LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl + ") IDs.");
        } else {
            logger.info("Total num of urls (IDs) checked from the input was: " + checkedUrls);
        }
        if (SignalUtils.receivedSIGINT) {
            logger.warn("A SIGINT signal was received, so some of the \"checked-urls\" may have not been actually checked, that's more of a number of the \"loaded-urls\".");
        }

        logger.info("Total " + ArgsUtils.targetUrlType + "s found: " + UrlUtils.sumOfDocUrlsFound + ". That's about: "
                + percentOf(UrlUtils.sumOfDocUrlsFound.get(), checkedUrls)
                + "% from the total numOfUrls checked. The rest were problematic or non-handleable url-cases.");
        if (FileUtils.shouldDownloadDocFiles) {
            final int retrievedDocFiles = FileUtils.docFileNameType.equals(FileUtils.DocFileNameType.numberName)
                    ? (FileUtils.numOfDocFile - ArgsUtils.initialNumOfDocFile)
                    : FileUtils.numOfDocFiles.get();
            logger.info("From which docUrls, we were able to retrieve: " + retrievedDocFiles + " distinct docFiles. That's about: "
                    + percentOf(retrievedDocFiles, UrlUtils.sumOfDocUrlsFound.get())
                    + "%. The un-retrieved docFiles were either belonging to already-found docUrls or they had connection-issues.");
        }
        logger.debug("The metaDocUrl-handler is responsible for the discovery of " + MetadataHandler.numOfMetaDocUrlsFound + " docUrls ("
                + percentOf(MetadataHandler.numOfMetaDocUrlsFound.get(), UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls).");
        logger.debug("The re-crossed docUrls (from all handlers) were " + ConnSupportUtils.reCrossedDocUrls.get() + ". That's about "
                + percentOf(ConnSupportUtils.reCrossedDocUrls.get(), UrlUtils.sumOfDocUrlsFound.get()) + "% of the found docUrls.");
        logger.debug("The M.L.A. was not enabled.");

        logShare(LoaderAndChecker.connProblematicUrls.get(), checkedUrls, "were pages which had connectivity problems.");
        logShare(MetadataHandler.numOfProhibitedAccessPagesFound.get(), checkedUrls, "were pages with prohibited access.");
        logShare(UrlTypeChecker.pagesNotProvidingDocUrls.get(), checkedUrls, "were pages which did not provide docUrls.");
        logShare(UrlTypeChecker.longToRespondUrls.get(), checkedUrls, "were urls which belong to domains which take too long to respond.");
        logShare(PageCrawler.contentProblematicUrls.get(), checkedUrls, "were urls which had problematic content.");
        // The prohibited-access pages are reported above, but they are not added to the sum of the problematic urls.
        long problematicUrls = LoaderAndChecker.connProblematicUrls.get() + UrlTypeChecker.pagesNotProvidingDocUrls.get()
                + UrlTypeChecker.longToRespondUrls.get() + PageCrawler.contentProblematicUrls.get();
        if (!LoaderAndChecker.useIdUrlPairs) {
            logShare(UrlTypeChecker.crawlerSensitiveDomains.get(), checkedUrls, "were from known crawler-sensitive domains.");
            logShare(UrlTypeChecker.javascriptPageUrls.get(), checkedUrls, "were from a JavaScript-powered domain, other than the \"sciencedirect.com\", which has dynamic links.");
            logShare(UrlTypeChecker.doajResultPageUrls.get(), checkedUrls, "were \"doaj.org/toc/\" urls, which are resultPages, thus being avoided to be crawled.");
            logShare(UrlTypeChecker.pagesWithHtmlDocUrls.get(), checkedUrls, "were docUrls, but, in HTML, thus being avoided to be crawled.");
            logShare(UrlTypeChecker.pagesRequireLoginToAccessDocFiles.get(), checkedUrls, "were of domains which are known to require login to access docFiles, thus, they were blocked before being connected.");
            logShare(UrlTypeChecker.pagesWithLargerCrawlingDepth.get(), checkedUrls, "were docPages which have their docUrl deeper inside their server, thus being currently avoided.");
            logShare(UrlTypeChecker.pangaeaUrls.get(), checkedUrls, "were \"PANGAEA.\" with invalid form and non-docUrls in their internal links.");
            logShare(UrlTypeChecker.urlsWithUnwantedForm.get(), checkedUrls, "were urls which are plain-domains, have unwanted url-extensions, ect...");
            logShare(LoaderAndChecker.inputDuplicatesNum.get(), checkedUrls, "were duplicates in the input file.");
            problematicUrls += UrlTypeChecker.crawlerSensitiveDomains.get() + UrlTypeChecker.javascriptPageUrls.get()
                    + UrlTypeChecker.doajResultPageUrls.get() + UrlTypeChecker.pagesWithHtmlDocUrls.get()
                    + UrlTypeChecker.pagesRequireLoginToAccessDocFiles.get() + UrlTypeChecker.pagesWithLargerCrawlingDepth.get()
                    + UrlTypeChecker.pangaeaUrls.get() + UrlTypeChecker.urlsWithUnwantedForm.get()
                    + LoaderAndChecker.inputDuplicatesNum.get();
        }
        logger.info("From the " + checkedUrls + " urls checked from the input, the " + problematicUrls + " of them (about "
                + percentOf(problematicUrls, checkedUrls) + "%) were problematic (sum of all of the cases that appear in debug-mode).");

        final long remainingUrls = ((checkedUrls + LoaderAndChecker.loadingRetries.get()) - UrlUtils.sumOfDocUrlsFound.get()) - problematicUrls;
        if (remainingUrls > 0) {
            final int failedTasks = LoaderAndChecker.totalNumFailedTasks.get();
            if (failedTasks > 0) {
                logger.info("The remaining " + (remainingUrls - failedTasks) + " urls either did not provide a fulltext or their status is unknown since " + failedTasks + " of them failed.");
            } else {
                logger.info("The remaining " + remainingUrls + " urls did not provide a fulltext.");
            }
        }

        logger.debug("The number of offline-redirects to HTTPS (reducing the online-redirection-overhead), was: " + HttpConnUtils.timesDidOfflineHTTPSredirect.get());
        logger.debug("The number of offline-redirects to slash-ending url (reducing the online-redirection-overhead), was: " + HttpConnUtils.timesDidOfflineSlashRedirect.get());
        logger.debug("The number of domains blocked due to an \"SSL Exception\", was: " + HttpConnUtils.numOfDomainsBlockedDueToSSLException.get());
        logger.debug("The number of domains blocked in total, during runtime, was: " + HttpConnUtils.blacklistedDomains.size());
        logger.debug("The number of paths blocked -due to HTTP 403- in total, was: " + ConnSupportUtils.domainsMultimapWithPaths403BlackListed.values().size());
        calculateAndPrintElapsedTime(instant, Instant.now(), null);
        logger.debug("Used " + ArgsUtils.workerThreadsCount + " worker threads.");

        if (logger.isDebugEnabled()) {
            // List the domains, starting with the one we connected to the most times.
            final List<Map.Entry<?, ?>> domains = new ArrayList<>(ConnSupportUtils.domainsWithConnectionData.entrySet());
            final Comparator<Map.Entry<?, ?>> byTimesConnected = Comparator.comparingInt(
                    (Map.Entry<?, ?> domain) -> ((DomainConnectionData) domain.getValue()).getTimesConnected());
            domains.sort(byTimesConnected.reversed());
            logger.debug(domains.size() + " domains : timesConnected");
            for (Map.Entry<?, ?> domain : domains) {
                logger.debug(domain.getKey() + " : " + ((DomainConnectionData) domain.getValue()).getTimesConnected());
            }
        }
    }

    /**
     * Formats {@code part} as a percentage of {@code total}, using {@link #df}.
     *
     * @return the formatted percentage; {@code "0.00"} when {@code total} is zero, instead of "NaN" or "∞"
     */
    private static String percentOf(long part, long total) {
        return df.format((total == 0) ? 0.0d : ((part * 100.0d) / total));
    }

    /** Logs, on DEBUG level, one "About x% (n urls) ..." line of the statistics. */
    private static void logShare(long count, long total, String description) {
        logger.debug("About " + percentOf(count, total) + "% (" + count + " urls) " + description);
    }

    /**
     * Logs, on INFO level, the time elapsed between the two given moments.
     *
     * @param instant  the start of the measured period
     * @param instant2 the end of the measured period
     * @param str      the text to put in front of the elapsed time, or {@code null} in order to use
     *                 the default "The program finished after: "
     */
    public static void calculateAndPrintElapsedTime(Instant instant, Instant instant2, String str) {
        final long elapsedMillis = Duration.between(instant, instant2).toMillis();
        final String prefix = (str != null) ? str : "The program finished after: ";
        logger.info(prefix + formatElapsedMillis(elapsedMillis));
    }

    /**
     * Renders a duration as text, e.g. "1 days, 2 hours, 3 minutes, 4 seconds and 5 milliseconds.".
     * Units with a zero value are left out; anything below one second is given in milliseconds only.
     */
    private static String formatElapsedMillis(long elapsedMillis) {
        if (elapsedMillis < 1000) {
            return elapsedMillis + " milliseconds.";
        }
        // Integer arithmetic is used on purpose: it avoids the truncation errors of the floating-point remainders.
        final long totalSeconds = elapsedMillis / 1000;
        final long totalMinutes = totalSeconds / 60;
        final long totalHours = totalMinutes / 60;

        final List<String> parts = new ArrayList<>(5);
        addIfPositive(parts, totalHours / 24, " days");
        addIfPositive(parts, totalHours % 24, " hours");
        addIfPositive(parts, totalMinutes % 60, " minutes");
        addIfPositive(parts, totalSeconds % 60, " seconds");
        addIfPositive(parts, elapsedMillis % 1000, " milliseconds");

        // At least one second has elapsed, so there is always at least one part.
        final int lastIndex = parts.size() - 1;
        if (lastIndex == 0) {
            return parts.get(0) + ".";
        }
        return String.join(", ", parts.subList(0, lastIndex)) + " and " + parts.get(lastIndex) + ".";
    }

    /** Appends "{@code value}{@code unit}" to the given parts, unless the value is zero or negative. */
    private static void addIfPositive(List<String> parts, long value, String unit) {
        if (value > 0) {
            parts.add(value + unit);
        }
    }

    /**
     * Logs, on DEBUG level, the entries of the given map sorted by their value, one entry per line.
     * Nothing is computed when the DEBUG level is disabled.
     *
     * @param concurrentHashMap the domains along with the number of docUrls each of them gave
     * @param z                 {@code true} for a descending order, {@code false} for an ascending one
     */
    public static void sortConcurrentHashMapByValueAndPrint(ConcurrentHashMap<String, Integer> concurrentHashMap, boolean z) {
        if (!logger.isDebugEnabled()) {
            return; // The result is only ever written to the debug-log, so avoid the copy and the sorting.
        }
        final List<Map.Entry<String, Integer>> entries = new ArrayList<>(concurrentHashMap.entrySet());
        final Comparator<Map.Entry<String, Integer>> byValue = Map.Entry.comparingByValue();
        entries.sort(z ? byValue.reversed() : byValue);
        logger.debug("The " + entries.size() + " domains which gave docUrls and their number:");
        for (Map.Entry<String, Integer> entry : entries) {
            logger.debug(entry.getKey() + " : " + entry.getValue());
        }
    }
}
