/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever;

import eu.openaire.publications_retriever.crawler.MetadataHandler;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.machine_learning.PageStructureMLA;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.file.HtmlFileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.DomainConnectionData;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.signal.SignalUtils;
import eu.openaire.publications_retriever.util.url.GenericUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.time.Duration;
import java.time.Instant;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Program entry point plus the end-of-run reporting helpers.
 *
 * <p>{@link #main(String[])} parses the arguments, prepares the input, creates the worker pool,
 * runs {@link LoaderAndChecker}, shuts the pool down and finally prints the statistics.
 * The remaining public methods only read counters and write log lines.
 */
public class PublicationsRetriever {
    private static final Logger logger = LoggerFactory.getLogger(PublicationsRetriever.class);

    /** The moment {@link #main(String[])} started working; {@code null} until then. */
    public static Instant startTime = null;

    /** Two-decimals formatter used for every percentage written by this class. */
    public static final DecimalFormat df = new DecimalFormat("0.00");

    /** The worker pool created by {@link #main(String[])}; {@code null} until then. */
    public static ExecutorService executor;

    /**
     * Runs the whole program.
     *
     * <p>Exits with {@code -44} when the connectivity check fails and with {@code -4} when
     * {@link LoaderAndChecker} throws a {@link RuntimeException}.
     *
     * @param args2 the command-line arguments, handed to {@code ArgsUtils.parseArgs}
     */
    public static void main(String[] args2) {
        logger.info("Calling main method with these args: ");
        for (String arg : args2) {
            logger.info("'" + arg + "'");
        }
        SignalUtils.setSignalHandlers();
        startTime = Instant.now();
        ArgsUtils.parseArgs(args2);
        if (!GenericUtils.checkInternetConnectivity()) {
            FileUtils.closeIO();
            System.exit(-44);
        }
        logger.info("Starting PublicationsRetriever..");
        ConnSupportUtils.setKnownMimeTypes();
        UrlTypeChecker.setRuntimeInitializedRegexes();
        prepareInput();
        new FileUtils(ArgsUtils.inputStream, System.out);

        if (ArgsUtils.workerThreadsCount == 0) {
            int availableThreads = Runtime.getRuntime().availableProcessors();
            availableThreads *= ArgsUtils.threadsMultiplier;
            // A fixed pool rejects a non-positive size, so never go below one worker.
            ArgsUtils.workerThreadsCount = Math.max(1, availableThreads);
        }
        logger.info("Use " + ArgsUtils.workerThreadsCount + " worker-threads.");
        executor = Executors.newFixedThreadPool(ArgsUtils.workerThreadsCount);

        try {
            new LoaderAndChecker();
        } catch (RuntimeException e) {
            String errorMessage = "There was a serious error! Output data is affected! Exiting..";
            System.err.println(errorMessage);
            logger.error(errorMessage, e); // Keep the cause, otherwise the failure cannot be diagnosed.
            FileUtils.closeIO();
            executor.shutdownNow();
            System.exit(-4);
        }

        boolean interrupted = shutdownWorkers();
        PublicationsRetriever.showStatistics(startTime);
        FileUtils.closeIO();
        if (interrupted) {
            // Restored only after the final report and the I/O shutdown, so that those still run normally.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Makes sure {@code ArgsUtils.inputStream} is set, or, when it already is, counts the lines
     * of the input file into {@code FileUtils.numOfLines}.
     *
     * <p>The line count is informational only, so any failure while reading is logged and ignored.
     */
    private static void prepareInput() {
        if (ArgsUtils.inputStream == null) {
            ArgsUtils.inputStream = ArgsUtils.inputFromUrl
                    ? ConnSupportUtils.getInputStreamFromInputDataUrl()
                    : new BufferedInputStream(System.in, 0x500000);
            return;
        }
        try (Stream<String> linesStream = Files.lines(Paths.get(ArgsUtils.inputFileFullPath))) {
            FileUtils.numOfLines = linesStream.count();
            logger.info("The numOfLines in the inputFile is " + FileUtils.numOfLines);
        } catch (IOException | java.io.UncheckedIOException ioe) {
            // "Files.lines" reports read/decoding problems lazily, as "UncheckedIOException", while counting.
            logger.error("Problem when retrieving the input-\"numOfLines\"!", ioe);
        }
    }

    /**
     * Shuts the worker pool down, waiting up to one minute before forcing the stop.
     *
     * @return {@code true} if the calling thread was interrupted while waiting
     */
    private static boolean shutdownWorkers() {
        logger.info("Shutting down the threads..");
        try {
            executor.shutdown();
            if (!executor.awaitTermination(1L, TimeUnit.MINUTES)) {
                logger.warn("The working threads did not finish on time! Stopping them immediately..");
                executor.shutdownNow();
            }
        } catch (SecurityException se) {
            logger.error("Could not shutdown the threads in any way..!", se);
        } catch (InterruptedException ie) {
            logger.warn("Interrupted while waiting for the working threads to finish! Stopping them immediately..");
            try {
                executor.shutdownNow();
            } catch (SecurityException se) {
                logger.error("Could not shutdown the threads in any way..!", se);
            }
            return true;
        }
        return false;
    }

    /**
     * Logs the final report: checked urls, found docUrls, the per-category breakdown (debug level),
     * the elapsed time and, when debug is enabled, the per-domain connection counts.
     *
     * <p>When no ID-url pairs are used and no urls were loaded, the method closes the I/O,
     * stops the workers and exits with {@code -5}.
     *
     * @param startTime the moment the run started, used for the elapsed-time line
     */
    public static void showStatistics(Instant startTime) {
        long inputCheckedUrlNum;
        long notConnectedIDs = 0L;
        int currentlyLoadedUrls = FileUtils.getCurrentlyLoadedUrls();

        if (LoaderAndChecker.useIdUrlPairs) {
            long idsWithoutSourceUrl = LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl.get();
            logger.debug(idsWithoutSourceUrl + " IDs (about " + percentage(idsWithoutSourceUrl, LoaderAndChecker.numOfIDs) + "%) had no acceptable sourceUrl.");
            notConnectedIDs = idsWithoutSourceUrl + FileUtils.duplicateIdUrlEntries;
            inputCheckedUrlNum = (long) LoaderAndChecker.numOfIDs - notConnectedIDs;
        } else {
            inputCheckedUrlNum = currentlyLoadedUrls;
            boolean nothingLoaded = (FileUtils.skipFirstRow && inputCheckedUrlNum < 0L)
                    || (!FileUtils.skipFirstRow && inputCheckedUrlNum == 0L);
            if (nothingLoaded) {
                String errorMessage = "\"FileUtils.getCurrentlyLoadedUrls()\" is unexpectedly reporting that no urls were retrieved from input file! Output data may be affected! Exiting..";
                System.err.println(errorMessage);
                logger.error(errorMessage);
                FileUtils.closeIO();
                if (executor != null) { // This public method may run before the pool exists.
                    executor.shutdownNow();
                }
                System.exit(-5);
            }
        }

        if (LoaderAndChecker.useIdUrlPairs && inputCheckedUrlNum < (long) currentlyLoadedUrls) {
            logger.info("Total num of urls (IDs) checked (& connected) from the input was: " + inputCheckedUrlNum + ". The rest " + notConnectedIDs + " urls (about " + percentage(notConnectedIDs, LoaderAndChecker.numOfIDs) + "%) belonged to duplicate (" + FileUtils.duplicateIdUrlEntries + ") and/or problematic (" + LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl.get() + ") IDs.");
        } else {
            logger.info("Total num of urls (IDs) checked from the input was: " + inputCheckedUrlNum);
        }
        if (SignalUtils.receivedSIGINT) {
            logger.warn("A SIGINT signal was received, so some of the \"checked-urls\" may have not been actually checked, that's more of a number of the \"loaded-urls\".");
        }

        long docUrlsFound = UrlUtils.sumOfDocUrlsFound.get();
        logger.info("Total " + ArgsUtils.targetUrlType + "s found: " + docUrlsFound + ". That's about: " + percentage(docUrlsFound, inputCheckedUrlNum) + "% from the total numOfUrls checked (" + inputCheckedUrlNum + "). The rest were problematic or non-handleable url-cases.");

        if (ArgsUtils.shouldDownloadDocFiles) {
            int numOfStoredDocFiles = ArgsUtils.fileNameType.equals(ArgsUtils.fileNameTypeEnum.numberName)
                    ? FileUtils.numOfDocFile - ArgsUtils.initialNumOfFile
                    : FileUtils.numOfDocFiles.get();
            logger.info("From which docUrls, we were able to retrieve: " + numOfStoredDocFiles + " distinct docFiles. That's about: " + percentage(numOfStoredDocFiles, docUrlsFound) + "%. The un-retrieved docFiles were either belonging to already-found " + ArgsUtils.targetUrlType + "s or they had connection-issues or they had problematic content.");
        }

        long metaDocUrls = MetadataHandler.numOfMetaDocUrlsFound.get();
        logger.debug("The metaDocUrl-handler is responsible for the discovery of " + metaDocUrls + " docUrls (" + percentage(metaDocUrls, docUrlsFound) + "% of the found docUrls).");
        long reCrossed = ConnSupportUtils.reCrossedDocUrls.get();
        logger.debug("The re-crossed " + ArgsUtils.targetUrlType + "s (from all handlers) were " + reCrossed + ". That's about " + percentage(reCrossed, docUrlsFound) + "% of the found docUrls.");

        if (ArgsUtils.shouldJustDownloadHtmlFiles) {
            long htmlFiles = HtmlFileUtils.htmlFilesNum.get();
            logger.info("Downloaded " + htmlFiles + " HTML files. That's about: " + percentage(htmlFiles, inputCheckedUrlNum) + "% from the total numOfUrls checked (" + inputCheckedUrlNum + "). The rest either were not pageUrls or they had various issues.");
        } else {
            logger.debug("The legacy M.L.A. was not enabled.");
            long validatedLinks = PageStructureMLA.structureValidatedDocLinks.get();
            logger.debug("The Structure-M.L.A. is responsible for the discovery of " + validatedLinks + " of the " + ArgsUtils.targetUrlType + "s (" + percentage(validatedLinks, docUrlsFound) + "%).");
            logger.debug("In total, it predicted " + PageStructureMLA.structurePredictedDocLinks.get() + " docLinks, with some of them not leading to a fulltext for various reasons (connection-problem, removed-file, unsupported docType, ect.).");
        }

        // Each call logs one category and returns its count; only the categories that belong to the "problematic" sum are added.
        long problematicUrlsNum = 0L;
        problematicUrlsNum += logShareOfChecked(LoaderAndChecker.connProblematicUrls.get(), inputCheckedUrlNum, "pages which had connectivity problems.");
        // Reported, but not part of the problematic-urls sum.
        logShareOfChecked(MetadataHandler.numOfProhibitedAccessPagesFound.get(), inputCheckedUrlNum, "pages with prohibited access.");
        problematicUrlsNum += logShareOfChecked(UrlTypeChecker.pagesNotProvidingDocUrls.get(), inputCheckedUrlNum, "pages which did not provide docUrls.");
        problematicUrlsNum += logShareOfChecked(UrlTypeChecker.longToRespondUrls.get(), inputCheckedUrlNum, "urls which belong to domains which take too long to respond.");
        problematicUrlsNum += logShareOfChecked(PageCrawler.contentProblematicUrls.get(), inputCheckedUrlNum, "urls which had problematic content.");
        if (!LoaderAndChecker.useIdUrlPairs) {
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.crawlerSensitiveDomains.get(), inputCheckedUrlNum, "from known crawler-sensitive domains.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.javascriptPageUrls.get(), inputCheckedUrlNum, "from a JavaScript-powered domain, other than the \"sciencedirect.com\", which has dynamic links.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.doajResultPageUrls.get(), inputCheckedUrlNum, "\"doaj.org/toc/\" urls, which are resultPages, thus being avoided to be crawled.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.pagesWithHtmlDocUrls.get(), inputCheckedUrlNum, "docUrls, but, in HTML, thus being avoided to be crawled.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.pagesRequireLoginToAccessDocFiles.get(), inputCheckedUrlNum, "of domains which are known to require login to access docFiles, thus, they were blocked before being connected.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.pagesWithLargerCrawlingDepth.get(), inputCheckedUrlNum, "docPages which have their docUrl deeper inside their server, thus being currently avoided.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.pangaeaUrls.get(), inputCheckedUrlNum, "\"PANGAEA.\" with invalid form and non-docUrls in their internal links.");
            problematicUrlsNum += logShareOfChecked(UrlTypeChecker.urlsWithUnwantedForm.get(), inputCheckedUrlNum, "urls which are plain-domains, have unwanted url-extensions, ect...");
            problematicUrlsNum += logShareOfChecked(LoaderAndChecker.inputDuplicatesNum.get(), inputCheckedUrlNum, "duplicates in the input file.");
        }
        logger.info("From the " + inputCheckedUrlNum + " urls checked from the input, the " + problematicUrlsNum + " of them (about " + percentage(problematicUrlsNum, inputCheckedUrlNum) + "%) were problematic (sum of all of the cases that appear in debug-mode).");

        long remainingNonProblematicUrls = inputCheckedUrlNum + (long) LoaderAndChecker.loadingRetries.get() - docUrlsFound - problematicUrlsNum;
        if (remainingNonProblematicUrls > 0L) {
            int failedTasks = LoaderAndChecker.totalNumFailedTasks.get();
            if (failedTasks > 0) {
                remainingNonProblematicUrls -= failedTasks;
                logger.info("The remaining " + remainingNonProblematicUrls + " urls either did not provide a fulltext or their status is unknown since " + failedTasks + " of them failed.");
            } else {
                logger.info("The remaining " + remainingNonProblematicUrls + " urls did not provide a fulltext.");
            }
        }

        logger.debug("The number of offline-redirects to HTTPS (reducing the online-redirection-overhead), was: " + HttpConnUtils.timesDidOfflineHTTPSredirect.get());
        logger.debug("The number of offline-redirects to slash-ending url (reducing the online-redirection-overhead), was: " + HttpConnUtils.timesDidOfflineSlashRedirect.get());
        logger.debug("The number of contentTypes which were extracted from the body of http-responses was: " + ConnSupportUtils.numContentTypeExtractedFromPageContent.get());
        logger.debug("The number of domains blocked due to an \"SSL Exception\", was: " + HttpConnUtils.numOfDomainsBlockedDueToSSLException.get());
        logger.debug("The number of domains blocked in total, during runtime, was: " + HttpConnUtils.blacklistedDomains.size());
        logger.debug("The number of paths blocked -due to HTTP 403- in total, was: " + ConnSupportUtils.domainsMultimapWithPaths403BlackListed.values().size());
        PublicationsRetriever.calculateAndPrintElapsedTime(startTime, Instant.now(), null);
        logger.debug("Used " + ArgsUtils.workerThreadsCount + " worker threads.");
        if (logger.isDebugEnabled()) {
            logDomainConnectionCounts();
        }
    }

    /**
     * Formats {@code part} as a percentage of {@code total}, with two decimals.
     *
     * @param part  the counted subset
     * @param total the whole; when it is zero the result is "0.00" instead of "NaN" or "∞"
     * @return the formatted percentage, without the percent sign
     */
    private static String percentage(double part, double total) {
        return df.format(total == 0.0 ? 0.0 : part * 100.0 / total);
    }

    /**
     * Logs, at debug level, one "About X% (N urls) were ..." line.
     *
     * @param count       how many urls fall in the category
     * @param total       the number of checked urls the percentage refers to
     * @param description the text following "were "
     * @return {@code count}, so the caller can add it to a running sum
     */
    private static long logShareOfChecked(long count, long total, String description) {
        logger.debug("About " + percentage(count, total) + "% (" + count + " urls) were " + description);
        return count;
    }

    /** Logs, at debug level, every known domain with its connection count, most-connected first. */
    private static void logDomainConnectionCounts() {
        LinkedList<Map.Entry<String, DomainConnectionData>> domains = new LinkedList<>(ConnSupportUtils.domainsWithConnectionData.entrySet());
        Comparator<Map.Entry<String, DomainConnectionData>> byTimesConnected = Comparator.comparingInt(entry -> entry.getValue().getTimesConnected());
        domains.sort(byTimesConnected.reversed());
        logger.debug(domains.size() + " domains : timesConnected");
        for (Map.Entry<String, DomainConnectionData> entry : domains) {
            logger.debug(entry.getKey() + " : " + entry.getValue().getTimesConnected());
        }
    }

    /**
     * Logs, at info level, the time elapsed between two instants in a human-readable form,
     * e.g. "The program finished after: 1 days, 2 hours, 3 minutes, 4 seconds and 5 milliseconds.".
     *
     * @param startTime     the beginning of the measured period
     * @param finishTime    the end of the measured period
     * @param customMessage the text to put in front of the duration, or {@code null} to use
     *                      "The program finished after: "
     */
    public static void calculateAndPrintElapsedTime(Instant startTime, Instant finishTime, String customMessage) {
        long timeElapsedMillis = Duration.between(startTime, finishTime).toMillis();
        String prefix = customMessage != null ? customMessage : "The program finished after: ";
        logger.info(prefix + formatElapsedTime(timeElapsedMillis));
    }

    /**
     * Breaks a duration into days, hours, minutes, seconds and milliseconds, skipping the zero parts.
     * Only integer arithmetic is used, so no unit can be off by one because of rounding.
     *
     * @param totalMillis the duration in milliseconds
     * @return the text, ending with a period; below one second it is just "N milliseconds."
     */
    private static String formatElapsedTime(long totalMillis) {
        if (totalMillis < 1000L) {
            return totalMillis + " milliseconds.";
        }
        long totalSecs = totalMillis / 1000L;
        long totalMins = totalSecs / 60L;
        long totalHours = totalMins / 60L;

        StringBuilder text = new StringBuilder();
        appendUnit(text, totalHours / 24L, "days");
        appendUnit(text, totalHours % 24L, "hours");
        appendUnit(text, totalMins % 60L, "minutes");
        appendUnit(text, totalSecs % 60L, "seconds");

        long remainingMillis = totalMillis % 1000L;
        if (remainingMillis > 0L) {
            // At least one bigger unit was written above, since a whole second has elapsed.
            text.append(" and ").append(remainingMillis).append(" milliseconds");
        }
        return text.append('.').toString();
    }

    /** Appends "{@code value unitName}" (comma-separated from what is already there) when {@code value} is positive. */
    private static void appendUnit(StringBuilder text, long value, String unitName) {
        if (value <= 0L) {
            return;
        }
        if (text.length() > 0) {
            text.append(", ");
        }
        text.append(value).append(' ').append(unitName);
    }

    /**
     * Logs, at debug level, the entries of the given table sorted by their value.
     * Nothing is computed when debug logging is disabled.
     *
     * @param table           the name-to-count table to print; it is not modified
     * @param descendingOrder {@code true} to list the biggest values first
     */
    public static void sortConcurrentHashMapByValueAndPrint(ConcurrentHashMap<String, Integer> table, boolean descendingOrder) {
        if (!logger.isDebugEnabled()) {
            return;
        }
        LinkedList<Map.Entry<String, Integer>> list = new LinkedList<>(table.entrySet());
        Comparator<Map.Entry<String, Integer>> byValue = Map.Entry.comparingByValue();
        list.sort(descendingOrder ? byValue.reversed() : byValue);
        logger.debug("The " + list.size() + " domains which gave " + ArgsUtils.targetUrlType + "s and their number:");
        for (Map.Entry<String, Integer> entry : list) {
            logger.debug(entry.getKey() + " : " + entry.getValue());
        }
    }
}

