package eu.openaire.publications_retriever.crawler;

import ch.qos.logback.classic.Level;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/crawler/MachineLearning.class */
/**
 * Path-based "machine learning" helper (the "MLA") used by the crawler.
 *
 * <p>For every (pageUrl, docUrl) pair handed to {@link #gatherMLData}, the class remembers which
 * docUrl-path was found for which pageUrl-path (and which file-extension the docUrl's ID carried).
 * {@link #predictInternalDocUrl} later combines those remembered docUrl-paths with the document-ID
 * of a new pageUrl in order to guess that page's docUrl without crawling the page.
 * {@link #shouldRunPrediction} decides whether predictions are currently worth attempting, based on
 * the success-rate measured between consecutive breakpoints.
 *
 * <p>All state is static and shared. The collections are concurrent/synchronized ones and
 * {@link #shouldRunPrediction} is {@code synchronized}; the plain {@code int}/{@code boolean}
 * counters are only written by the constructor and by {@link #shouldRunPrediction}.
 */
public class MachineLearning {

    /** Global switch for the MLA; it is not read inside this class. */
    public static final boolean useMLA = false;

    /** Minimum success-rate (percent) a round must reach for the MLA to keep running. */
    private static final float leastSuccessPercentageForMLA = 51.0f;

    /** Failure-count handed to {@code ConnSupportUtils.countAndBlockDomainAfterTimes} before a domain gets blocked. */
    private static final int timesToFailBeforeBlockedFromMLA = 10;

    /** A pagePath having more known docUrl-paths than this is considered incompatible with the MLA. */
    private static final int maxDocPathsPerPagePath = 5;

    private static final Logger logger = LoggerFactory.getLogger(MachineLearning.class);

    /** Per-domain failure counters, maintained by {@code ConnSupportUtils.countAndBlockDomainAfterTimes}. */
    private static final ConcurrentHashMap<String, Integer> timesDomainsFailedInMLA = new ConcurrentHashMap<String, Integer>();

    /** Success-rate (percent) recorded at every breakpoint reached in {@link #shouldRunPrediction}. */
    private static final List<Double> successRateList = Collections.synchronizedList(new ArrayList<Double>());

    /** Domains for which neither data-gathering nor prediction is performed. */
    private static final Set<String> domainsBlockedFromMLA = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

    static {
        domainsBlockedFromMLA.add("sciencedirect.com");
    }

    /** Value of {@link #docUrlsFoundByMLA} when the current measuring round started. */
    private static int latestMLADocUrlsFound = 0;

    // The decompiled source spelled this initial value as logback's "Level.TRACE_INT", which is the same number (5000).
    // A plain literal is used, since this threshold has nothing to do with logging.
    private static int urlsToGatherBeforeStarting = 5000;

    private static int leastNumOfUrlsToCheckBeforeAccuracyTest = 1000;
    private static int urlsToWaitUntilRestartMLA = 30000;
    private static boolean mlaStarted = false;
    private static int endOfSleepNumOfUrls = 0;
    private static int latestSuccessBreakPoint = 0;

    /** Value of {@link #pageUrlsCheckedWithMLA} when the current measuring round started. */
    private static int latestUrlsMLAChecked = 0;

    private static boolean isInSleepMode = false;

    /** Number of (pageUrl, docUrl) pairs accepted by {@link #gatherMLData}. */
    public static final AtomicInteger timesGatheredData = new AtomicInteger(0);

    /** Number of pageUrls for which {@link #predictInternalDocUrl} actually built predictions. */
    private static final AtomicInteger pageUrlsCheckedWithMLA = new AtomicInteger(0);

    /** Read by {@link #shouldRunPrediction}; it is never incremented inside this class. */
    public static AtomicInteger totalPagesReachedMLAStage = new AtomicInteger(0);

    /** pageUrl-path -> docUrl-paths which were found for pages having that path. */
    public static final SetMultimap<String, String> successPathsHashMultiMap = Multimaps.synchronizedSetMultimap(HashMultimap.<String, String>create());

    /** docUrl-path -> file-extension matched in the document-ID of a docUrl having that path. */
    public static final ConcurrentHashMap<String, String> successDocPathsExtensionHashMap = new ConcurrentHashMap<>();

    /** Number of docUrls which were successfully predicted. */
    public static AtomicInteger docUrlsFoundByMLA = new AtomicInteger(0);

    /**
     * Scales the (static) thresholds of the MLA to the size of the input.
     *
     * <p>The approximate number of urls to check is taken as 70% of {@code FileUtils.numOfLines} when
     * {@code LoaderAndChecker.useIdUrlPairs} is set, otherwise as 85% of it. Then:
     * <ul>
     *   <li>{@code urlsToGatherBeforeStarting} is capped to 10% of that number,</li>
     *   <li>{@code leastNumOfUrlsToCheckBeforeAccuracyTest} is raised to at least 5% of it,</li>
     *   <li>{@code urlsToWaitUntilRestartMLA} is capped to 20% of it.</li>
     * </ul>
     * Note that this constructor mutates static state, so every instantiation re-applies the limits.
     */
    public MachineLearning() {
        logger.debug("Initializing the MLA..");

        double expectedShare = LoaderAndChecker.useIdUrlPairs ? 0.7d : 0.85d;
        long approxNumOfTotalUrlsToCheck = (long) (FileUtils.numOfLines * expectedShare);
        logger.debug("\"approxNumOfTotalUrlsToCheck\" = {}", approxNumOfTotalUrlsToCheck);

        urlsToGatherBeforeStarting = Math.min(urlsToGatherBeforeStarting, (int) (approxNumOfTotalUrlsToCheck * 0.1d));
        logger.debug("\"urlsToGatherBeforeStarting\" = {}", urlsToGatherBeforeStarting);

        leastNumOfUrlsToCheckBeforeAccuracyTest = Math.max(leastNumOfUrlsToCheckBeforeAccuracyTest, (int) (approxNumOfTotalUrlsToCheck * 0.05d));
        logger.debug("\"leastNumOfUrlsToCheckBeforeAccuracyTest\" = {}", leastNumOfUrlsToCheckBeforeAccuracyTest);

        urlsToWaitUntilRestartMLA = Math.min(urlsToWaitUntilRestartMLA, (int) (approxNumOfTotalUrlsToCheck * 0.2d));
        logger.debug("\"urlsToWaitUntilRestartMLA\" = {}", urlsToWaitUntilRestartMLA);
    }

    /**
     * Records that {@code docUrl} was found for {@code pageUrl}, so that later predictions can use it.
     *
     * <p>Nothing is recorded when the two urls are equal, when either of them is {@code null}, when the
     * domain is blocked from the MLA, or when any of the url-parts (path, document-ID) cannot be extracted.
     *
     * @param pageUrl the url of the page
     * @param docUrl  the url of the document which was found for that page
     * @param domain  the domain of {@code pageUrl}, or {@code null} to have it extracted from {@code pageUrl}
     */
    public static void gatherMLData(String pageUrl, String docUrl, String domain) {
        if (pageUrl == null || docUrl == null || pageUrl.equals(docUrl)) {
            return; // Nothing can be learned from a missing url or from a page which is its own docUrl.
        }

        // The page-matcher is only created when the domain has to be extracted here;
        // otherwise "null" is handed to "getPathStr()", exactly as before.
        Matcher pageMatcher = null;
        String pageDomain = domain;
        if (pageDomain == null) {
            pageMatcher = UrlUtils.getUrlMatcher(pageUrl);
            if (pageMatcher == null) {
                return;
            }
            pageDomain = UrlUtils.getDomainStr(pageUrl, pageMatcher);
            if (pageDomain == null) {
                return;
            }
        }

        if (domainsBlockedFromMLA.contains(pageDomain)) {
            return;
        }

        String pagePath = UrlUtils.getPathStr(pageUrl, pageMatcher);
        if (pagePath == null) {
            return;
        }

        Matcher docMatcher = UrlUtils.getUrlMatcher(docUrl);
        if (docMatcher == null) {
            return;
        }

        String docPath = UrlUtils.getPathStr(docUrl, docMatcher);
        if (docPath == null) {
            return;
        }

        String docId = UrlUtils.getDocIdStr(docUrl, docMatcher);
        if (docId == null) {
            return;
        }

        // Remember the extension of the docUrl's ID, so that it can be appended to predicted urls.
        Matcher extensionMatcher = FileUtils.EXTENSION_PATTERN.matcher(docId);
        if (extensionMatcher.find()) {
            successDocPathsExtensionHashMap.put(docPath, extensionMatcher.group());
        }

        successPathsHashMultiMap.put(pagePath, docPath);
        timesGatheredData.incrementAndGet();
    }

    /**
     * Computes the success-rate of the current measuring round.
     *
     * @return the percentage of docUrls found by the MLA among the pageUrls it checked since the round
     *         started, or {@code 0.0} when no pageUrl was checked in this round (instead of the
     *         {@code NaN}/{@code Infinity} a division by zero would produce)
     */
    public static double getCurrentSuccessRate() {
        int checkedInThisRound = pageUrlsCheckedWithMLA.get() - latestUrlsMLAChecked;
        if (checkedInThisRound <= 0) {
            return 0.0d;
        }
        int foundInThisRound = docUrlsFoundByMLA.get() - latestMLADocUrlsFound;
        return (foundInThisRound * 100.0d) / checkedInThisRound;
    }

    /**
     * Decides whether a prediction should be attempted right now.
     *
     * <p>The MLA starts only after more than {@code urlsToGatherBeforeStarting} pairs were gathered.
     * Afterwards, each time {@link #totalPagesReachedMLAStage} reaches the next breakpoint, the round's
     * success-rate is recorded; if it is below {@code leastSuccessPercentageForMLA} the MLA enters
     * "sleep-mode" until {@code urlsToWaitUntilRestartMLA} more pages have reached the MLA-stage.
     *
     * @return {@code true} if the caller should run {@link #predictInternalDocUrl}, {@code false} otherwise
     */
    public static synchronized boolean shouldRunPrediction() {
        if (!mlaStarted) {
            if (timesGatheredData.get() <= urlsToGatherBeforeStarting) {
                latestSuccessBreakPoint = urlsToGatherBeforeStarting;
                return false;
            }
            mlaStarted = true;
            logger.info("Starting the MLA..");
        }

        if (isInSleepMode) {
            if (totalPagesReachedMLAStage.get() <= endOfSleepNumOfUrls) {
                return false;
            }
            logger.debug("MLA's \"sleepMode\" is finished, it will now restart.");
            isInSleepMode = false;
            return true;
        }

        // Summed as "long", so that the breakpoint cannot overflow the int-range.
        long nextBreakPoint = (long) latestSuccessBreakPoint + leastNumOfUrlsToCheckBeforeAccuracyTest + endOfSleepNumOfUrls;
        if (totalPagesReachedMLAStage.get() < nextBreakPoint) {
            return true; // The accuracy is tested only at breakpoints.
        }

        double currentSuccessRate = getCurrentSuccessRate();
        logger.debug("Breakpoint (urlNum={}) reached. Current round's success rate of MLA = {}%",
                nextBreakPoint, PublicationsRetriever.df.format(currentSuccessRate));
        successRateList.add(currentSuccessRate);

        if (currentSuccessRate >= leastSuccessPercentageForMLA) {
            endOfSleepNumOfUrls = 0;
            latestSuccessBreakPoint = totalPagesReachedMLAStage.get();
            return true;
        }

        logger.debug("MLA's success-rate is lower than the satisfying one ({}). Entering \"sleep-mode\", but continuing to gather ML-data...",
                leastSuccessPercentageForMLA);
        endOfSleepNumOfUrls = totalPagesReachedMLAStage.get() + urlsToWaitUntilRestartMLA;
        // The next round is measured starting from the current counters.
        latestMLADocUrlsFound = docUrlsFoundByMLA.get();
        latestUrlsMLAChecked = pageUrlsCheckedWithMLA.get();
        latestSuccessBreakPoint++;
        isInSleepMode = true;
        return false;
    }

    /**
     * Tries to predict the docUrl of {@code pageUrl} out of the docUrl-paths which were gathered for its path.
     *
     * <p>Each candidate is built as {@code docUrlPath + documentId [+ extension]} and is only considered
     * when it is contained in {@code currentPageLinks}. A candidate which is already known as a docUrl is
     * handled as a re-crossed docUrl; any other candidate is verified through a connection.
     *
     * <p>A domain gets blocked from the MLA when one of its pagePaths has more than
     * {@code maxDocPathsPerPagePath} docUrl-paths, or when
     * {@code ConnSupportUtils.countAndBlockDomainAfterTimes} reports that it has failed too many times.
     *
     * @param urlId            passed through to {@code ConnSupportUtils.handleReCrossedDocUrl} and
     *                         {@code HttpConnUtils.connectAndCheckMimeType} as their first argument
     *                         (presumably the ID of the record; confirm against those helpers)
     * @param sourceUrl        passed through to the same helpers as their second argument
     *                         (presumably the originally given url; confirm against those helpers)
     * @param pageUrl          the url of the page whose docUrl should be predicted
     * @param domain           the domain to check against, and possibly add to, the MLA-blocked domains
     * @param currentPageLinks the links against which every predicted docUrl is checked
     * @return {@code true} if a docUrl was found for this page, {@code false} otherwise
     */
    public static boolean predictInternalDocUrl(String urlId, String sourceUrl, String pageUrl, String domain, HashSet<String> currentPageLinks) {
        if (domainsBlockedFromMLA.contains(domain)) {
            logger.debug("Avoiding the MLA-prediction for incompatible domain: \"{}\".", domain);
            return false;
        }

        Matcher pageMatcher = UrlUtils.getUrlMatcher(pageUrl);
        if (pageMatcher == null) {
            return false;
        }

        String pagePath = UrlUtils.getPathStr(pageUrl, pageMatcher);
        if (pagePath == null) {
            return false;
        }

        // The views of a synchronized multimap have to be iterated while holding the multimap's lock.
        // A snapshot is taken, so that the lock is not held during the network-connections below.
        final List<String> knownDocPaths;
        synchronized (successPathsHashMultiMap) {
            knownDocPaths = new ArrayList<String>(successPathsHashMultiMap.get(pagePath));
        }

        if (knownDocPaths.isEmpty()) {
            return false;
        }

        if (knownDocPaths.size() > maxDocPathsPerPagePath) {
            logger.warn("Domain: \"{}\" was blocked from being accessed again by the MLA, after retrieving a proved-to-be incompatible pagePath (having more than {} possible docUrl-paths).",
                    domain, maxDocPathsPerPagePath);
            domainsBlockedFromMLA.add(domain);
            successPathsHashMultiMap.removeAll(pagePath);
            return false;
        }

        String docId = UrlUtils.getDocIdStr(pageUrl, pageMatcher);
        if (docId == null) {
            return false;
        }

        // Drop a plain-page extension from the ID; the docUrl's own extension (if known) is appended below.
        // "Locale.ROOT" keeps the lower-casing independent of the platform's default locale.
        if (UrlTypeChecker.PLAIN_PAGE_EXTENSION_FILTER.matcher(docId.toLowerCase(java.util.Locale.ROOT)).matches()) {
            docId = FileUtils.EXTENSION_PATTERN.matcher(docId).replaceAll("");
        }

        pageUrlsCheckedWithMLA.incrementAndGet();

        StringBuilder urlBuilder = new StringBuilder(300);
        for (String docPath : knownDocPaths) {
            urlBuilder.setLength(0);
            urlBuilder.append(docPath).append(docId);
            String extension = successDocPathsExtensionHashMap.get(docPath);
            if (extension != null) {
                urlBuilder.append(extension);
            }
            String predictedDocUrl = urlBuilder.toString();

            if (!currentPageLinks.contains(predictedDocUrl)) {
                continue; // Only predictions which are actual links of the current page are examined.
            }
            logger.debug("Found a \"predictedDocUrl\" which exists in the \"currentPageLinks\": {}", predictedDocUrl);

            if (UrlUtils.docOrDatasetUrlsWithIDs.containsKey(predictedDocUrl)) {
                logger.info("MachineLearningAlgorithm got a hit for pageUrl: \"{}\"! Resulted (already found before) docUrl was: \"{}\"", pageUrl, predictedDocUrl);
                ConnSupportUtils.handleReCrossedDocUrl(urlId, sourceUrl, pageUrl, predictedDocUrl, false);
                docUrlsFoundByMLA.incrementAndGet();
                return true;
            }

            try {
                logger.debug("Going to connect & check predictedDocUrl: \"{}\", made out from pageUrl: \"{}\"", predictedDocUrl, pageUrl);
                if (HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, predictedDocUrl, null, false, true)) {
                    logger.info("MachineLearningAlgorithm got a hit for pageUrl: \"{}\"! Resulted docUrl was: \"{}\"", pageUrl, predictedDocUrl);
                    docUrlsFoundByMLA.incrementAndGet();
                    return true;
                }
                logger.debug("The predictedDocUrl was not a valid docUrl: \"{}\"", predictedDocUrl);
            } catch (Exception e) {
                // A failing prediction is not an error for the caller: the remaining candidates are still tried.
                logger.trace("The check of the predictedDocUrl \"{}\" failed: {}", predictedDocUrl, e.toString());
            }
        }

        // No prediction succeeded for this page: count the failure for the domain.
        if (!ConnSupportUtils.countAndBlockDomainAfterTimes(domainsBlockedFromMLA, timesDomainsFailedInMLA, domain, timesToFailBeforeBlockedFromMLA, false)) {
            return false;
        }

        logger.warn("Domain: \"{}\" was blocked from being accessed again by the MLA, after proved to be incompatible {} times.",
                domain, timesToFailBeforeBlockedFromMLA);
        // The paths are removed in one step; the returned set holds exactly the docUrl-paths which got removed.
        for (String removedDocPath : successPathsHashMultiMap.removeAll(pagePath)) {
            successDocPathsExtensionHashMap.remove(removedDocPath);
        }
        return false;
    }

    /**
     * Computes the average of the success-rates recorded at the breakpoints.
     *
     * @return the mean of the recorded success-rates, or {@link #getCurrentSuccessRate()} when no
     *         breakpoint has been reached yet
     */
    public static double getAverageSuccessRate() {
        // A synchronized list has to be locked by the caller while it is traversed; working on a copy
        // also keeps the shared list in its recording order.
        final List<Double> recordedRates;
        synchronized (successRateList) {
            recordedRates = new ArrayList<Double>(successRateList);
        }

        if (recordedRates.isEmpty()) {
            return getCurrentSuccessRate();
        }

        Collections.sort(recordedRates); // The values are summed in ascending order, as before.
        double sum = 0.0d;
        for (double rate : recordedRates) {
            sum += rate;
        }
        return sum / recordedRates.size();
    }

    /**
     * Logs, at debug-level, every pagePath for which data was gathered, along with its docUrl-paths.
     */
    public static void printGatheredData() {
        if (!logger.isDebugEnabled()) {
            return; // Nothing would be printed, so the traversal is skipped.
        }

        logger.debug("Here is the MLA data gathered throughout the program's execution:");
        // The views of a synchronized multimap have to be iterated while holding the multimap's lock.
        synchronized (successPathsHashMultiMap) {
            Set<String> pagePaths = successPathsHashMultiMap.keySet();
            logger.debug("Data was gathered and accepted for {} docPagePaths:", pagePaths.size());
            for (String pagePath : pagePaths) {
                logger.debug("\nDocPagePath: {}\n\tdocUrlPaths:", pagePath);
                for (String docPath : successPathsHashMultiMap.get(pagePath)) {
                    logger.debug("\tDocUrlPath: {}", docPath);
                }
            }
        }
    }
}
