package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/crawler/MetadataHandler.class */
/**
 * Inspects the HTML of a page for two kinds of {@code <meta>} tags and acts on them:
 * <ul>
 *   <li>"DC.Rights" / "DC.AccessRights" tags, whose content may state that the page is not openly accessible;</li>
 *   <li>"..pdf_url" / "eprints.document_url" tags, whose content is a direct url to the document (the "metaDocUrl").</li>
 * </ul>
 * All members are static; the class keeps two public counters which are updated by
 * {@link #checkAndHandleMetadata(String, String, String, String, String)}.
 */
public class MetadataHandler {
    // Declared first, so that it is already initialized when the static initializer below logs.
    private static final Logger logger = LoggerFactory.getLogger(MetadataHandler.class);

    // Regex fragments of the attributes which are looked for inside a <meta> tag.
    // NOTE(review): the '.' in "DC." and "eprints.document" is an unescaped regex dot (matches any character); kept as-is.
    private static final String metaAccessName = "name=\"DC.(?:Access)?Rights\"";
    private static final String metaAccessContent = "content=\"([^\"]+)\"";
    private static final String metaName = "name=\"(?:[^<]*(?:(?:citation|wkhealth)(?:_fulltext)?_)?pdf|eprints.document)_url\"";
    private static final String metaContent = "content=\"(http[^\"]+)\"";

    /**
     * Common start of the comment that is stored for every page discarded by this class.
     * It names a class and a method which do not match the ones of this file; it is kept verbatim,
     * since it is data written to the output and not a code reference.
     */
    private static final String DISCARD_COMMENT_PREFIX = "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its ";

    /** Matches (lower-cased) metaDocUrls ending with an extension which is not wanted; built in the static initializer. */
    public static Pattern COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS;

    /**
     * Matches a "://localhost" or "://127.0.0.1" host, with an optional port.
     * The dots of the IP are escaped, so that only the literal address is matched.
     */
    public static Pattern LOCALHOST_DOMAIN_REPLACEMENT_PATTERN = Pattern.compile("://(?:localhost|127\\.0\\.0\\.1)(?:\\:[\\d]+)?");

    /** Number of pages discarded because their access-rights metadata matched {@link #NO_ACCESS_RIGHTS}. */
    public static AtomicInteger numOfProhibitedAccessPagesFound = new AtomicInteger(0);

    /** Number of pages for which the metaDocUrl was an already-known url or was verified by the mime-type check. */
    public static AtomicInteger numOfMetaDocUrlsFound = new AtomicInteger(0);

    /**
     * Matches a meta tag declaring access rights, with the "name" and "content" attributes in either order.
     * The content is captured in group 1 (name first) or in group 2 (content first).
     */
    public static final Pattern META_RESTRICTED_ACCESS_RIGHTS = Pattern.compile(
            "<(?:(?i)meta)(?:[^<]*" + metaAccessName + "[^<]*" + metaAccessContent
                    + "|[^<]*" + metaAccessContent + "[^<]*" + metaAccessName + ")[^>]*[/]?>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches a lower-cased access-rights text which denotes non-open access.
     * Group 1 holds the matched keyword (e.g. "closed", "embargoed") only for the first alternative; for the others it is null.
     */
    public static final Pattern NO_ACCESS_RIGHTS = Pattern.compile(".*(?:(close[d]?|embargo(?:ed)?|restrict(?:ed)?|metadata(?:\\s|%20|-|_)*only|paid)(?:(?:\\s|%20|-|_)*access)?|(?:no[t]?|není)(?:\\s|%20|-|_)*(?:accessible|přístupná)|inaccessible|(?:acceso(?:\\s|%20|-|_)*)?cerrado).*");

    /**
     * Matches a meta tag carrying a document url, with the "name" and "content" attributes in either order.
     * The url is captured in group 1 (name first) or in group 2 (content first).
     */
    public static final Pattern META_DOC_URL = Pattern.compile(
            "<meta(?:[^<]*" + metaName + "[^<]*" + metaContent
                    + "|[^<]*" + metaContent + "[^<]*" + metaName + ")[^>]*[/]?>",
            Pattern.CASE_INSENSITIVE);

    static {
        // Which extensions are unwanted depends on what is configured to be retrieved.
        StringBuilder regex = new StringBuilder(".+\\.(?:");
        if (!LoaderAndChecker.retrieveDatasets) {
            regex.append("zip|rar|");
        } else if (!LoaderAndChecker.retrieveDocuments) {
            regex.append("pdf|doc[x]?|");
        }
        regex.append("apk|jpg|png)(?:\\?.+)?$");
        logger.debug("COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS -> REGEX: {}", regex);
        COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS = Pattern.compile(regex.toString());
    }

    /**
     * Checks the metadata of a page and decides whether the page still has to be crawled.
     * <ol>
     *   <li>If the access-rights metadata match {@link #NO_ACCESS_RIGHTS}, the page is recorded as discarded.</li>
     *   <li>Otherwise, if documents are to be retrieved and a metaDocUrl exists, that url is validated,
     *       normalized and checked; the outcome is either recorded or the page is left for crawling.</li>
     * </ol>
     *
     * @param str  forwarded unchanged as the first argument of the output and connection helpers.
     *             NOTE(review): presumably the url identifier; confirm against the callers.
     * @param str2 forwarded unchanged as the second argument of the same helpers.
     *             NOTE(review): presumably the source url; confirm against the callers.
     * @param str3 the url of the page (the "pageUrl" of the log messages).
     * @param str4 the domain of the page; it replaces a "localhost" host inside the metaDocUrl and
     *             it is the domain which gets blacklisted when the metaDocUrl is a dynamic link.
     * @param str5 the HTML of the page; when null, no metadata is found.
     * @return {@code true} when the page was fully handled here (discarded, or its metaDocUrl accepted),
     *         {@code false} when the caller should continue by crawling the page.
     */
    public static boolean checkAndHandleMetadata(String str, String str2, String str3, String str4, String str5) {
        // Readable aliases for the parameters whose role is evident from their usage.
        final String pageUrl = str3;
        final String pageDomain = str4;
        final String pageHtml = str5;

        if (discardIfAccessProhibited(str, str2, pageUrl, pageHtml)) {
            return true;
        }
        if (!LoaderAndChecker.retrieveDocuments) {
            return false;
        }
        return handleMetaDocUrl(str, str2, pageUrl, pageDomain, pageHtml);
    }

    /** Records the page as discarded and returns true, when its access-rights metadata denote non-open access. */
    private static boolean discardIfAccessProhibited(String str, String str2, String pageUrl, String pageHtml) {
        String metaAccessRights = getMetaAccessRightsFromHTML(pageHtml);
        if (metaAccessRights == null) {
            logger.trace("Could not retrieve the metaAccessRights for url \"{}\", continue by checking the metaDocUrl..", pageUrl);
            return false;
        }
        logger.trace("metaAccessRights: {}", metaAccessRights);

        // The pattern is lower-case only. Locale.ROOT keeps the lowering independent of the default locale
        // (under a Turkish locale 'I' becomes a dotless 'ı', which would defeat e.g. "restricted").
        Matcher noAccessMatcher = NO_ACCESS_RIGHTS.matcher(metaAccessRights.toLowerCase(Locale.ROOT));
        if (!noAccessMatcher.matches()) {
            return false;
        }

        // Group 1 is null when one of the alternatives without a capturing group matched.
        String accessRight = noAccessMatcher.group(1);
        if (accessRight == null || accessRight.isEmpty()) {
            accessRight = "prohibited";
        }
        logger.debug("The metaAccessRights were found to be \"{}\"! Do not check the metaDocUrl, nor crawl the page!", accessRight);
        recordDiscardedPage(str, str2, pageUrl, DISCARD_COMMENT_PREFIX + "accessRight were '" + accessRight + "'.", "true");
        numOfProhibitedAccessPagesFound.incrementAndGet();
        return true;
    }

    /** Extracts, validates and checks the metaDocUrl of the page; returns true when the page needs no crawling. */
    private static boolean handleMetaDocUrl(String str, String str2, String pageUrl, String pageDomain, String pageHtml) {
        String metaDocUrl = getMetaDocUrlFromHTML(pageHtml);
        if (metaDocUrl == null) {
            logger.trace("Could not retrieve the metaDocUrl, continue by crawling the page..");
            return false;
        }
        logger.trace("MetaDocUrl: {}", metaDocUrl);

        if (metaDocUrl.equals(pageUrl)
                || ConnSupportUtils.haveOnlyProtocolDifference(pageUrl, metaDocUrl)
                || ConnSupportUtils.isJustASlashRedirect(pageUrl, metaDocUrl)) {
            logger.debug("The metaDocUrl was found to be the same as the pageUrl! Continue by crawling the page..");
            return false;
        }

        // Unrendered template markers mean that the site serves a non-usable, dynamic link.
        if (metaDocUrl.contains("{{") || metaDocUrl.contains("<?")) {
            logger.trace("The metaDocUrl is a dynamic-link. Abort the process and block the domain of the pageUrl.");
            HttpConnUtils.blacklistedDomains.add(pageDomain);
            logger.warn("Domain: \"{}\" was blocked, after giving a dynamic metaDocUrl: {}", pageDomain, metaDocUrl);
            recordDiscardedPage(str, str2, pageUrl, DISCARD_COMMENT_PREFIX + "metaDocUrl was a dynamic-link.", "false");
            PageCrawler.contentProblematicUrls.incrementAndGet();
            return true;
        }

        // The filters below are lower-case only; lower the url in a locale-independent way.
        String lowerCaseMetaDocUrl = metaDocUrl.toLowerCase(Locale.ROOT);
        if (UrlTypeChecker.CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches()
                || UrlTypeChecker.PLAIN_PAGE_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches()
                || UrlTypeChecker.URL_DIRECTORY_FILTER.matcher(lowerCaseMetaDocUrl).matches()
                || COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS.matcher(lowerCaseMetaDocUrl).matches()) {
            logger.warn("The retrieved metaDocUrl ( {} ) is pointing to an unsupported file, continue by crawling the page..", metaDocUrl);
            return false;
        }
        if (PageCrawler.NON_VALID_DOCUMENT.matcher(lowerCaseMetaDocUrl).matches()) {
            logger.warn("The retrieved metaDocUrl ( {} ) is pointing to a false-positive full-text file, avoid crawling the page..!", metaDocUrl);
            recordDiscardedPage(str, str2, pageUrl, DISCARD_COMMENT_PREFIX + "metaDocUrl is pointing to a false-positive full-text file.", "false");
            return true;
        }

        String normalizedUrl = LoaderAndChecker.basicURLNormalizer.filter(metaDocUrl);
        if (normalizedUrl == null) {
            logger.warn("Could not normalize metaDocUrl: {} , continue by crawling the page..", metaDocUrl);
            return false;
        }

        // A metaDocUrl pointing to "localhost" gets the domain of the page instead.
        // quoteReplacement() makes sure that the domain is inserted literally ('$' and '\' are special in replacements).
        String docUrl = LOCALHOST_DOMAIN_REPLACEMENT_PATTERN.matcher(normalizedUrl)
                .replaceFirst(Matcher.quoteReplacement("://" + pageDomain));

        if (UrlUtils.docOrDatasetUrlsWithIDs.containsKey(docUrl)) {
            // This url is already known; no new connection is needed.
            ConnSupportUtils.handleReCrossedDocUrl(str, str2, pageUrl, docUrl, false);
            numOfMetaDocUrlsFound.incrementAndGet();
            return true;
        }

        try {
            if (HttpConnUtils.connectAndCheckMimeType(str, str2, pageUrl, docUrl, pageDomain, false, true)) {
                numOfMetaDocUrlsFound.incrementAndGet();
                return true;
            }
            logger.warn("The retrieved metaDocUrl was NOT a docUrl (unexpected): {} , continue by crawling the page..", docUrl);
            return false;
        } catch (DomainBlockedException e) {
            // Discard the page only when the blocked domain is the page's own one; otherwise it can still be crawled.
            String blockedDomain = UrlUtils.getDomainStr(docUrl, null);
            if (blockedDomain != null && blockedDomain.equals(pageDomain)) {
                recordDiscardedPage(str, str2, pageUrl, DISCARD_COMMENT_PREFIX + "domain was blocked.", "false");
                return true;
            }
            return false;
        } catch (Exception e) {
            // Only a RuntimeException reporting "HTTP 401" or "HTTP 403" is treated as an authorization problem.
            String message = (e instanceof RuntimeException) ? e.getMessage() : null;
            if (message != null && (message.contains("HTTP 401") || message.contains("HTTP 403"))) {
                logger.warn("The MetaDocUrl < {} > had authorization issues, so further crawling of this page is aborted.", docUrl);
                recordDiscardedPage(str, str2, pageUrl, DISCARD_COMMENT_PREFIX + "metaDocUrl had authorization issues.", "false");
                return true;
            }
            logger.warn("The MetaDocUrl < {} > had connectivity or redirection problems! Continue by crawling the page..", docUrl);
            return false;
        }
    }

    /**
     * Stores the output record of a discarded page.
     * All arguments given to {@code UrlUtils.addOutputData} are the same for every discard-case of this class,
     * except for the comment and the twelfth argument.
     *
     * @param comment    the reason of the discard.
     * @param twelfthArg the value of the twelfth argument; its meaning is defined by
     *                   {@code UrlUtils.addOutputData}, which is not visible from this file.
     */
    private static void recordDiscardedPage(String str, String str2, String pageUrl, String comment, String twelfthArg) {
        UrlUtils.addOutputData(str, str2, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, comment, null, true, "true", "true", "false", "false", twelfthArg, null, "null");
    }

    /**
     * Collects the contents of all the access-rights meta tags of the given HTML.
     * Contents which start with "http" or which are longer than 200 characters are skipped.
     *
     * @param str the HTML of the page; may be null.
     * @return the accepted contents, each one followed by " -- ", or {@code null} when there is none (or the HTML is null).
     */
    public static String getMetaAccessRightsFromHTML(String str) {
        if (str == null) {
            return null;
        }
        Matcher tagMatcher = META_RESTRICTED_ACCESS_RIGHTS.matcher(str);
        StringBuilder accessRights = new StringBuilder(500);
        while (tagMatcher.find()) {
            String content = firstCapturedGroup(tagMatcher);
            if (content != null && !content.startsWith("http") && content.length() <= 200) {
                accessRights.append(content).append(" -- ");
            }
        }
        return (accessRights.length() == 0) ? null : accessRights.toString();
    }

    /**
     * Retrieves the url of the first meta tag of the given HTML which carries a document url.
     *
     * @param str the HTML of the page; may be null.
     * @return the url, or {@code null} when no such tag exists (or the HTML is null).
     */
    public static String getMetaDocUrlFromHTML(String str) {
        if (str == null) {
            return null;
        }
        Matcher tagMatcher = META_DOC_URL.matcher(str);
        return tagMatcher.find() ? firstCapturedGroup(tagMatcher) : null;
    }

    /**
     * Returns group 1 of the current match or, when that did not participate, group 2.
     * It must be called only after a successful find(), on a pattern having at least two groups.
     */
    private static String firstCapturedGroup(Matcher matcher) {
        String value = matcher.group(1);
        return (value != null) ? value : matcher.group(2);
    }
}
