package eu.openaire.publications_retriever.crawler;

import ch.qos.logback.classic.spi.CallerData;
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkUnavailableException;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.class */
/**
 * Static helpers which recognise the page-urls of specific sites and rewrite them, mostly
 * into urls which are expected to serve the document directly.
 *
 * <p>Every {@code checkAndHandle*} / {@code checkAndGet*} method (apart from
 * {@link #checkAndHandleDergipark(String)}) follows the same contract: it returns {@code null}
 * when the given url does not belong to the site it handles, otherwise it returns the
 * (possibly unchanged) url to use.
 */
public class SpecialUrlsHandler {
    private static final Logger logger = LoggerFactory.getLogger(SpecialUrlsHandler.class);

    private static final String europepmcPageUrlBasePath = "https://europepmc.org/backend/ptpmcrender.fcgi?accid=";
    private static final String nasaBaseDomainPath = "https://ntrs.nasa.gov/";
    private static final String ieeexploreBasePath = "https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=";
    private static final String ijcseonlineBaseUrl = "https://www.ijcseonline.org/pub_paper/";

    /** The character which starts the query-part of a url. */
    private static final String QUERY_START = "?";
    private static final String DOWNLOAD_PARAM = "download=true";

    /** Output-comment shared by every "could not retrieve the pdf-url" outcome of the turkjgastroenterol-handler. */
    private static final String TURKJGASTROENTEROL_RETRIEVAL_PROBLEM = "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving the \"turkjgastroenterol\"-pdf-url from its html.";

    /**
     * Captures (group 1) a "/content/files/....pdf" path which is the sole content of a {@code <div>}.
     * The field is intentionally left public and non-final, exactly as it was, so that existing
     * users of it are not affected.
     */
    public static Pattern Turkjgastroenterol_docUrl_pattern = Pattern.compile("<div\\s*>\\s*(/content/files/[^<>]+\\.pdf)\\s*</div>");

    /** Captures (group 1) everything after the last "&amp;"-separator which follows the last path-segment. */
    private static final Pattern IJCSEONLINE_PDF_FILENAME = Pattern.compile(".+/[^/]+&(.+)$");

    /** Matches "onlinelibrary.wiley.com/[segment/]doi/..." urls; group 1 is the optional segment before "doi/". */
    private static final Pattern ONLINELIBRARY_WILEY = Pattern.compile("https?://[^/]*onlinelibrary\\.wiley\\.com/([^/]+/)?doi/.*");

    /** Matches doi.org-urls whose path is itself a url; group 1 is that inner url. */
    private static final Pattern DOI_URL_WITH_INNER_LINK = Pattern.compile("https?://(?:dx\\.)?doi\\.org/(http.*)");

    /** Matches a ":/" which is NOT followed by a second slash, i.e. a scheme-separator which lost one slash. */
    private static final Pattern COLLAPSED_SCHEME_SEPARATOR = Pattern.compile(":/(?!/)");

    /**
     * Runs the given url through the site-specific handlers, in a fixed order, and returns the
     * result of the first handler which recognises it. When no handler recognises the url, the
     * result of {@link #checkAndHandleDergipark(String)} is returned.
     *
     * @param str the url to check
     * @return the url to use from now on
     * @throws RuntimeException propagated from {@link #checkAndGetFrontiersinDocUrl(String)}
     */
    public static String checkAndHandleSpecialUrls(String str) throws RuntimeException {
        String handledUrl = checkAndGetEuropepmcDocUrl(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndDowngradeManuscriptElsevierUrl(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndGetNasaDocUrl(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndGetFrontiersinDocUrl(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandlePsyarxiv(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandleIjcseonlinePage(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandleIeeeExplorer(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandleOSFurls(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandleWileyUrls(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandleScieloUrls(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        handledUrl = checkAndHandleDoiUrlsWithInnerLinks(str);
        if (handledUrl != null) {
            return handledUrl;
        }
        return checkAndHandleDergipark(str);
    }

    /**
     * Builds the "ptpmcrender.fcgi" pdf-url for a "europepmc.org" url.
     *
     * @param str the url to check
     * @return {@code null} if the url is not a "europepmc.org" url or already is a "ptpmcrender.fcgi" one;
     *         the given url if no doc-id could be extracted from it; otherwise the pdf-url
     */
    public static String checkAndGetEuropepmcDocUrl(String str) {
        if (!str.contains("europepmc.org") || str.contains("ptpmcrender.fcgi")) {
            return null;
        }
        String docIdStr = UrlUtils.getDocIdStr(str, null);
        if (docIdStr == null) {
            return str;
        }
        // The "accid" parameter always gets a "PMC"-prefixed id.
        String accId = docIdStr.startsWith("PMC") ? docIdStr : "PMC" + docIdStr;
        return europepmcPageUrlBasePath + accId + "&blobtype=pdf";
    }

    /**
     * Replaces the first "https" of a "manuscript.elsevier.com" url with "http".
     *
     * @param str the url to check
     * @return the rewritten url, or {@code null} if the url is not a "manuscript.elsevier.com" one
     */
    public static String checkAndDowngradeManuscriptElsevierUrl(String str) {
        if (!str.contains("manuscript.elsevier.com")) {
            return null;
        }
        return StringUtils.replace(str, "https", "http", 1);
    }

    /**
     * Builds the "api/.../downloads/&lt;id&gt;.pdf" url for a "ntrs.nasa.gov/citations" url.
     *
     * @param str the url to check
     * @return {@code null} if the url is not a nasa-citations url or already contains "api/";
     *         the given url if no doc-id could be extracted from it; otherwise the download-url
     */
    public static String checkAndGetNasaDocUrl(String str) {
        if (!str.contains("ntrs.nasa.gov/citations") || str.contains("api/")) {
            return null;
        }
        String docIdStr = UrlUtils.getDocIdStr(str, null);
        if (docIdStr == null) {
            return str;
        }
        String citationPath = StringUtils.replace(str, nasaBaseDomainPath, "", 1);
        if (!citationPath.endsWith("/")) {
            citationPath = citationPath + "/";
        }
        return "https://ntrs.nasa.gov/api/" + citationPath + "downloads/" + docIdStr + ".pdf";
    }

    /**
     * Turns a "www.frontiersin.org" article-url into its "/pdf" counterpart.
     *
     * @param str the url to check
     * @return {@code null} if the url is not a "www.frontiersin.org" one; the given url if it already ends
     *         with "/pdf" or no doc-id could be extracted from it; otherwise the url with a trailing "/full"
     *         replaced by "/pdf", or with "/pdf" appended
     * @throws RuntimeException if the url belongs to the site but contains neither a "/pdf" ending nor "/article"
     */
    public static String checkAndGetFrontiersinDocUrl(String str) {
        if (!str.contains("www.frontiersin.org")) {
            return null;
        }
        if (str.endsWith("/pdf")) {
            return str;
        }
        if (!str.contains("/article")) {
            throw new RuntimeException("This \"frontiersin\"-url is known to not lead to a docUrl: " + str);
        }
        if (UrlUtils.getDocIdStr(str, null) == null) {
            return str;
        }
        final String fullSuffix = "/full";
        if (str.endsWith(fullSuffix)) {
            // Swap only the trailing "/full"; an earlier "/full..." inside the path must stay untouched.
            return str.substring(0, str.length() - fullSuffix.length()) + "/pdf";
        }
        return str + "/pdf";
    }

    /**
     * Appends "/download" to a "psyarxiv.com" url which does not contain it yet.
     *
     * @param str the url to check
     * @return the download-url, or {@code null} if the url is not a "psyarxiv.com" one
     */
    public static String checkAndHandlePsyarxiv(String str) {
        if (!str.contains("psyarxiv.com")) {
            return null;
        }
        if (str.contains("/download")) {
            return str;
        }
        return str.endsWith("/") ? str + "download" : str + "/download";
    }

    /**
     * Replaces the "dergipark.gov.tr" domain with "dergipark.org.tr".
     * Unlike the other handlers, this one returns the given url (not {@code null}) when nothing matches.
     *
     * @param str the url to check
     * @return the url with the domain replaced wherever it occurs
     */
    public static String checkAndHandleDergipark(String str) {
        return StringUtils.replace(str, "dergipark.gov.tr", "dergipark.org.tr");
    }

    /**
     * Looks for a pdf-path inside the given html (using {@link #Turkjgastroenterol_docUrl_pattern}),
     * turns it into a full, normalized url and connects to it in order to check its mime-type.
     * Every failure-path which is handled here writes an output-record through {@code UrlUtils.addOutputData}.
     *
     * @param str  the html in which the pdf-path is searched
     * @param str2 forwarded as the first argument to the output, url-checking and connection calls
     *             (presumably the id of the url - to be confirmed against the callers)
     * @param str3 forwarded as the second argument of the output-record (presumably the source-url - to be confirmed)
     * @param str4 the url of the page; it is the base for resolving the extracted path and appears in the logs
     * @param str5 forwarded as the sixth argument of the output-record (presumably the page's domain - to be confirmed)
     * @return {@code true} only if the connection-and-mime-type check completed without throwing
     */
    public static boolean extractAndCheckTurkjgastroenterolDocUrl(String str, String str2, String str3, String str4, String str5) {
        Matcher matcher = Turkjgastroenterol_docUrl_pattern.matcher(str);
        if (!matcher.find()) {
            reportTurkjgastroenterolDiscard(str2, str3, str4, str5, TURKJGASTROENTEROL_RETRIEVAL_PROBLEM, "true", "false");
            return false;
        }
        try {
            String extractedPath = matcher.group(1);
            if (extractedPath == null || extractedPath.isEmpty()) {
                logger.warn("No pdf-url was found inside the html of page: {}", str4);
                reportTurkjgastroenterolDiscard(str2, str3, str4, str5, TURKJGASTROENTEROL_RETRIEVAL_PROBLEM, "true", "false");
                PageCrawler.contentProblematicUrls.incrementAndGet();
                return false;
            }

            String fullyFormedUrl = ConnSupportUtils.getFullyFormedUrl(str4, extractedPath, null);
            String normalizedUrl = (fullyFormedUrl != null) ? LoaderAndChecker.basicURLNormalizer.filter(fullyFormedUrl) : null;
            if (normalizedUrl == null) {
                logger.warn("Could not normalize url: {}", extractedPath);
                reportTurkjgastroenterolDiscard(str2, str3, str4, str5, "Discarded in 'PageCrawler.visit()' method, as the retrievied \"turkjgastroenterol\"-pdf-url had normalization's problems.", "true", "false");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                return false;
            }

            String checkedUrl = LoaderAndChecker.handleUrlChecks(str2, normalizedUrl);
            if (checkedUrl == null) {
                return false;
            }
            if (UrlUtils.docOrDatasetUrlsWithIDs.containsKey(checkedUrl)) {
                // This url is already known, so it is handed over to the "re-crossed" handling instead of connecting again.
                ConnSupportUtils.handleReCrossedDocUrl(str2, checkedUrl, checkedUrl, checkedUrl, true);
                return false;
            }

            try {
                HttpConnUtils.connectAndCheckMimeType(str2, checkedUrl, checkedUrl, checkedUrl, null, true, true);
                return true;
            } catch (Exception e) {
                // Index 0: was-valid, index 1: could-retry, index 2: the error-text used in the comment.
                List<String> wasValidAndCouldRetry = LoaderAndChecker.getWasValidAndCouldRetry(e, checkedUrl);
                reportTurkjgastroenterolDiscard(str2, str3, str4, str5, "Discarded in 'PageCrawler.visit()' method, as there was a problem in checking the retrieved 'turkjgastroenterol'-pdf-url: " + wasValidAndCouldRetry.get(2), wasValidAndCouldRetry.get(0), wasValidAndCouldRetry.get(1));
                return false;
            }
        } catch (Exception e2) {
            logger.warn("No pdf-url was found inside the html of page: {}", str4, e2);
            reportTurkjgastroenterolDiscard(str2, str3, str4, str5, TURKJGASTROENTEROL_RETRIEVAL_PROBLEM, "true", "false");
            PageCrawler.contentProblematicUrls.incrementAndGet();
            return false;
        }
    }

    /** Writes the output-record of a discarded turkjgastroenterol-page; only the comment, the 9th and the 12th argument vary between the callers. */
    private static void reportTurkjgastroenterolDiscard(String str2, String str3, String str4, String str5, String comment, String wasValid, String couldRetry) {
        UrlUtils.addOutputData(str2, str3, str4, UrlUtils.unreachableDocOrDatasetUrlIndicator, comment, str5, true, "true", wasValid, "false", "false", couldRetry, null, "null");
    }

    /**
     * Searches the given elements for one whose "data-title" attribute contains "ownload" and whose
     * "action" attribute is not blank. This method never returns normally: the outcome is signalled
     * through one of the two exceptions.
     *
     * @param str      the url of the page; used only in the message of the "unavailable" exception
     * @param elements the elements to search
     * @throws DocLinkFoundException       carrying the trimmed "action" value of the first matching element
     * @throws DocLinkUnavailableException if no element matched
     */
    public static void handleAupOnlinePage(String str, Elements elements) throws DocLinkFoundException, DocLinkUnavailableException {
        for (Element element : elements) {
            // "ownload" matches both "Download" and "download".
            if (!element.attr("data-title").contains("ownload")) {
                continue;
            }
            String action = element.attr("action").trim();
            if (!action.isEmpty()) {
                throw new DocLinkFoundException(action);
            }
        }
        throw new DocLinkUnavailableException("No docUrl was found inside a form-element, for \"aup-online.com\" pageUrl: " + str);
    }

    /**
     * Builds the "pub_paper/&lt;file-name&gt;" url for a "www.ijcseonline.org" "pdf_paper_view.php" url.
     *
     * @param str the url to check
     * @return {@code null} if the url is not a "www.ijcseonline.org" one; the given url if it is not a
     *         "pdf_paper_view.php" url or no file-name could be extracted; otherwise the direct url
     */
    public static String checkAndHandleIjcseonlinePage(String str) {
        if (!str.contains("www.ijcseonline.org")) {
            return null;
        }
        if (!str.contains("pdf_paper_view.php")) {
            return str;
        }
        Matcher matcher = IJCSEONLINE_PDF_FILENAME.matcher(str);
        if (!matcher.matches()) {
            return str;
        }
        String pdfFileName = matcher.group(1);
        if (pdfFileName == null || pdfFileName.isEmpty()) {
            logger.error("No pdf-file-name was extracted from pageUrl: {}", str);
            return str;
        }
        return ijcseonlineBaseUrl + pdfFileName;
    }

    /**
     * Builds the "stampPDF/getPDF.jsp" url for an "ieeexplore.ieee.org" url.
     *
     * @param str the url to check
     * @return {@code null} if the url is not an "ieeexplore.ieee.org" one; the given url if it already
     *         contains "/stampPDF/" or no doc-id could be extracted; otherwise the pdf-url
     */
    public static String checkAndHandleIeeeExplorer(String str) {
        if (!str.contains("ieeexplore.ieee.org")) {
            return null;
        }
        if (str.contains("/stampPDF/")) {
            return str;
        }
        String docIdStr = UrlUtils.getDocIdStr(str, null);
        return (docIdStr != null) ? ieeexploreBasePath + docIdStr : str;
    }

    /**
     * Appends "/download" to an "osf.io" url which does not contain it yet.
     *
     * @param str the url to check
     * @return the download-url, or {@code null} if the url does not contain "://osf.io"
     */
    public static String checkAndHandleOSFurls(String str) {
        if (!str.contains("://osf.io")) {
            return null;
        }
        if (str.contains("/download")) {
            return str;
        }
        return str.endsWith("/") ? str + "download" : str + "/download";
    }

    /**
     * Rewrites Wiley urls towards their "pdfdirect" form with a "download=true" parameter.
     * "api.wiley.com/onlinelibrary" urls are rebuilt from their doc-id; "onlinelibrary.wiley.com/.../doi/..."
     * urls get the optional segment before "doi/" removed and their view-segment
     * ("epdf/", "pdf/", "full/", "abs/") replaced by "pdfdirect/". A "full-xml/" segment is only
     * rewritten to "full/".
     *
     * @param str the url to check
     * @return the rewritten url, or {@code null} if the url is not a handled Wiley url
     *         (or it is an api-url from which no doc-id could be extracted)
     */
    public static String checkAndHandleWileyUrls(String str) {
        Matcher matcher = ONLINELIBRARY_WILEY.matcher(str);
        if (!matcher.matches()) {
            if (!str.contains("api.wiley.com/onlinelibrary")) {
                return null;
            }
            String docIdStr = UrlUtils.getDocIdStr(str, null);
            if (docIdStr == null) {
                return null;
            }
            return "https://onlinelibrary.wiley.com/doi/pdfdirect/" + docIdStr + QUERY_START + DOWNLOAD_PARAM;
        }

        String url = str;
        int segmentStart = matcher.start(1);    // -1 when the optional segment is absent
        int segmentEnd = matcher.end(1);
        if (segmentStart >= 0 && segmentEnd > segmentStart) {
            // Cut the segment out at its exact position; a plain text-replace could also hit the same
            // characters elsewhere in the url (even inside the host-name).
            url = str.substring(0, segmentStart) + str.substring(segmentEnd);
        }

        if (url.contains("/pdfdirect/")) {
            return appendDownloadParam(url);
        }

        url = stripSuffix(url, "/abstract", "/fullpdf");

        String rewrittenUrl;
        if (url.contains("epdf/")) {
            rewrittenUrl = StringUtils.replace(url, "epdf/", "pdfdirect/", 1);
        } else if (url.contains("pdf/")) {
            rewrittenUrl = StringUtils.replace(url, "pdf/", "pdfdirect/", 1);
        } else if (url.contains("full/")) {
            rewrittenUrl = StringUtils.replace(url, "full/", "pdfdirect/", 1);
        } else if (url.contains("abs/")) {
            rewrittenUrl = StringUtils.replace(url, "/doi/abs/", "/doi/pdfdirect/", 1);
        } else if (url.contains("full-xml/")) {
            rewrittenUrl = StringUtils.replace(url, "/full-xml/", "/full/", 1);
        } else {
            rewrittenUrl = StringUtils.replace(url, "/doi/", "/doi/pdfdirect/", 1);
        }
        return appendDownloadParam(rewrittenUrl);
    }

    /** Removes the first of the given suffixes which the url ends with; returns the url unchanged if none applies. */
    private static String stripSuffix(String url, String... suffixes) {
        for (String suffix : suffixes) {
            if (url.endsWith(suffix)) {
                return url.substring(0, url.length() - suffix.length());
            }
        }
        return url;
    }

    /** Adds the "download=true" parameter, unless it is already there, using "?" or "&amp;" as appropriate. */
    private static String appendDownloadParam(String url) {
        if (url.contains(DOWNLOAD_PARAM)) {
            return url;
        }
        String separator = url.contains(QUERY_START) ? "&" : QUERY_START;
        return url + separator + DOWNLOAD_PARAM;
    }

    /**
     * Replaces the first "/pdfdirect/" of a "www.embopress.org" url with "/pdf/".
     *
     * @param str the url to check
     * @return {@code null} if the url does not contain "://www.embopress.org"; the given url if it already
     *         contains "/pdf/" or has no "/pdfdirect/"; otherwise the rewritten url
     */
    public static String checkAndHandleEmbopressUrls(String str) {
        if (!str.contains("://www.embopress.org")) {
            return null;
        }
        if (str.contains("/pdf/") || !str.contains("/pdfdirect/")) {
            return str;
        }
        return StringUtils.replace(str, "/pdfdirect/", "/pdf/", 1);
    }

    /**
     * Restores the plain "&amp;" separators of a "scielo.br" url whose ampersands were html-encoded.
     *
     * @param str the url to check
     * @return the decoded url, or {@code null} if the url does not contain "scielo.br"
     */
    public static String checkAndHandleScieloUrls(String str) {
        if (!str.contains("scielo.br")) {
            return null;
        }
        // Decode the complete entity first, so that it becomes a single "&" instead of "&&".
        String decodedUrl = StringUtils.replace(str, "&amp;", "&");
        // Any remaining "amp;" lost its leading ampersand; it still stands for one separator.
        return StringUtils.replace(decodedUrl, "amp;", "&");
    }

    /**
     * Extracts the url which is embedded in the path of a "doi.org" (or "dx.doi.org") url.
     *
     * @param str the url to check
     * @return {@code null} if the url is not a doi-url with an inner "http..." link; the given url if the
     *         inner link could not be extracted; otherwise the inner link, with every scheme-separator
     *         which was reduced to ":/" restored to "://"
     */
    public static String checkAndHandleDoiUrlsWithInnerLinks(String str) {
        Matcher matcher = DOI_URL_WITH_INNER_LINK.matcher(str);
        if (!matcher.matches()) {
            return null;
        }
        String innerLink = matcher.group(1);
        if (innerLink == null || innerLink.isEmpty()) {
            logger.warn("Could not extract he inner-link from weird-doi-url: " + str);
            return str;
        }
        // Repair only the separators which actually lost a slash; an intact "://" must not become ":///".
        return COLLAPSED_SCHEME_SEPARATOR.matcher(innerLink).replaceAll("://");
    }
}
