package eu.openaire.publications_retriever.util.url;

import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/util/url/UrlTypeChecker.class */
public class UrlTypeChecker {
    private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)";
    private static final String mediaExtensionsPattern = "ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov";
    private static final String docOrDatasetKeywords = "(?:file|pdf|document|dataset|article|fulltext)";
    private static final String wordsPattern = "[\\w/_.,-]{0,100}";
    private static final String docOrDatasetNegativeLookAroundPattern = "(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)";
    public static Pattern URL_DIRECTORY_FILTER;
    private static final Logger logger = LoggerFactory.getLogger((Class<?>) UrlTypeChecker.class);
    public static final Pattern CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER = Pattern.compile(".+\\.(?:(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)(?:\\?.+)?$");
    public static final Pattern URL_FILE_EXTENSION_FILTER = Pattern.compile(".+\\.(?:css|js(?:\\?y)?|ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov|pt|bib|nt|refer|enw|ris|mso|dtl|do|asc|c|cc(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|cxx|cpp|java|py)(?:\\?.+)?$");
    public static final Pattern INTERNAL_LINKS_KEYWORDS_FILTER = Pattern.compile(".*(?:doi.org|\\?l[a]?n[g]?=|isallowed=n|site=|link(?:out|listener)|login).*");
    public static final Pattern PLAIN_PAGE_EXTENSION_FILTER = Pattern.compile(".+(?<!(?:file|pdf|document|dataset|article|fulltext))\\.(?:(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)|[aj]sp[x]?|jsf|do|asc|cgi|cfm)(?:\\?(?!.*(?:file|pdf|document|dataset|article|fulltext)).*)?$");
    public static final Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:xml|(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)|rss|ris|bib).*");
    public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?<=[/.])(?:(?<!drive.)google\\.|goo.gl|gstatic|facebook|fb.me|twitter|(?:meta|xing|baidu|t|x|vk).co|insta(?:gram|paper)|tiktok|youtube|vimeo|linkedin|ebay|bing|(?:amazon|[./]analytics)\\.|s.w.org|wikipedia|myspace|yahoo|mail|pinterest|reddit|tumblr|www.ccdc.cam.ac.uk|figshare.com/collections/|datadryad.org/stash/dataset/|evernote|skype|microsoft|adobe|buffer|digg|stumbleupon|addthis|delicious|dailymotion|gostats|blog(?:ger)?|copyright|friendfeed|newsvine|telegram|getpocket|flipboard|line.me|ok.rudouban|qzone|renren|weibo|doubleclick|bit.ly|github|reviewofbooks|plu.mx|(?<!files.)wordpress|orcid.org|auth(?:oriz(?:e|ation)|entication)?\\.|(?<!manuscript.)elsevier.com|sciencedirect.com|(?:static|multimedia|tienda).elsevier.|arvojournals.org|books.openedition.org|perfdrive.|services.bepress.com|(?:careers|shop).|myworkdayjobs.com|editorialmanager.com|(tandfonline.com|persee.fr|papers.ssrn.com|documentation.ird.fr|library.unisa.edu.au|publications.cnr.it)|(doaj.org/toc/)|(dlib.org|saberes.fcecon.unr.edu.ar|eumed.net)|(rivisteweb.it|wur.nl|remeri.org.mx|cam.ac.uk|scindeks.ceon.rs|egms.de)|(bibliotecadigital.uel.br|cepr.org)|(scielosp.org(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|cepr.org|dk.um.si|apospublications.com|jorr.org|rwth-aachen.de|pubmed.ncbi.nlm.nih.gov)|(200.17.137.108))[^/]*/.*");
    public static final Pattern PLAIN_DOMAIN_FILTER = Pattern.compile("[^/]+://[\\w.:-]+(?:/[\\w]{2})?(?:/index.(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?))?[/]?(?:\\?(?:locale(?:-attribute)?|ln)=[\\w_-]+)?$");
    public static AtomicInteger javascriptPageUrls = new AtomicInteger(0);
    public static AtomicInteger crawlerSensitiveDomains = new AtomicInteger(0);
    public static AtomicInteger doajResultPageUrls = new AtomicInteger(0);
    public static AtomicInteger pagesWithHtmlDocUrls = new AtomicInteger(0);
    public static AtomicInteger pagesRequireLoginToAccessDocFiles = new AtomicInteger(0);
    public static AtomicInteger pagesWithLargerCrawlingDepth = new AtomicInteger(0);
    public static AtomicInteger longToRespondUrls = new AtomicInteger(0);
    public static AtomicInteger urlsWithUnwantedForm = new AtomicInteger(0);
    public static AtomicInteger pangaeaUrls = new AtomicInteger(0);
    public static AtomicInteger pagesNotProvidingDocUrls = new AtomicInteger(0);

    public static void setURLDirectoryFilterRegex() {
        URL_DIRECTORY_FILTER = Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|journal/key|(?:journal-)?editor|author:|(?<!ntrs.nasa.gov/(?:api/)?)citation|review|external|facets|statistics|application|selfarchive|permission|ethic(s)?/.*/view/|/view/(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|conta[c]?t|wallet|contribute|donate|our[_-][\\w]+|template|logo|image|photo/|video|advertiser|most-popular|people|(?:the)?press|for-authors|customer-service[s]?|captcha|clipboard|dropdown|widget|(?:forum|blog|column|row|js|css|rss|legal)/|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|(?<!response_type=)cookie|(?:page-)?not[-]?found|(?:404(?:_response)?|accessibility|invalid|catalog(?:ue|ar|o)?)\\.(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)|(.*/view/(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*))|(.*sharedsitesession)|(doi.org/https://doi.org/.*pangaea." + (!LoaderAndChecker.retrieveDatasets ? "|pangaea.)" : ")") + (!LoaderAndChecker.retrieveDatasets ? "|(?:bibtext|dc(?:terms)?|[^/]*(?:tei|endnote))$)" : ").*"));
        if (logger.isTraceEnabled()) {
            logger.trace("URL_DIRECTORY_FILTER:\n" + URL_DIRECTORY_FILTER);
        }
    }

    public static boolean shouldNotAcceptPageUrl(String str, String str2, String str3, String str4, boolean z) {
        String str5;
        String str6;
        if (str4 == null) {
            str4 = str3.toLowerCase();
        }
        Matcher matcher = URL_DIRECTORY_FILTER.matcher(str4);
        if (matcher.matches()) {
            if (!z) {
                return true;
            }
            String group = matcher.group(1);
            if (group == null || group.isEmpty()) {
                String group2 = matcher.group(2);
                if (group2 == null || group2.isEmpty()) {
                    String group3 = matcher.group(3);
                    if (group3 == null || group3.isEmpty()) {
                        str6 = "Discarded after matching to a directory with problems.";
                        UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str6, null, true, "true", "N/A", "false", "false", "false", null, "null");
                    } else {
                        str6 = "Discarded after matching to a 'PANGAEA.' url with invalid form and non-docUrls in their internal links: '" + group3 + "'.";
                        UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str6, null, true, "true", "N/A", "false", "false", "false", null, "null");
                        pangaeaUrls.incrementAndGet();
                    }
                } else {
                    ConnSupportUtils.blockSharedSiteSessionDomains(str3, null);
                    str6 = "It was discarded after participating in a 'sharedSiteSession-endlessRedirectionPack': '" + group2 + "'.";
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str6, null, true, "true", "N/A", "false", "false", "false", null, "null");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                }
            } else {
                str6 = "Discarded after matching to a site having its DocUrls in larger depth: '" + group + "'.";
                UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str6, null, true, "true", "N/A", "false", "false", "false", null, "null");
                pagesWithLargerCrawlingDepth.incrementAndGet();
            }
            logger.debug("Url-\"" + str3 + "\": " + str6);
            return true;
        }
        Matcher matcher2 = SPECIFIC_DOMAIN_FILTER.matcher(str4);
        if (!matcher2.matches()) {
            if (PageCrawler.NON_VALID_DOCUMENT.matcher(str4).matches()) {
                if (!z) {
                    return true;
                }
                logger.debug("Url-\"" + str3 + "\": Discarded after matching to a url leading to an invalid document!");
                UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a url leading to an invalid document!", null, true, "true", "N/A", "false", "false", "false", null, "null");
                PageCrawler.contentProblematicUrls.incrementAndGet();
                return true;
            }
            if (PLAIN_DOMAIN_FILTER.matcher(str4).matches()) {
                if (!z) {
                    return true;
                }
                logger.debug("Url-\"" + str3 + "\": Discarded after matching to a url having only the domain part!");
                UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a url having only the domain part!", null, true, "true", "N/A", "false", "false", "false", null, "null");
                return true;
            }
            if (URL_FILE_EXTENSION_FILTER.matcher(str4).matches()) {
                if (!z) {
                    return true;
                }
                logger.debug("Url-\"" + str3 + "\": Discarded after matching to a url having an irrelevant extension!");
                UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a url having an irrelevant extension!", null, true, "true", "N/A", "false", "false", "false", null, "null");
                return true;
            }
            if (!CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(str4).matches()) {
                return false;
            }
            if (!z) {
                return true;
            }
            logger.debug("Url-\"" + str3 + "\": Discarded after matching to a url having an unsupported document extension!");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a url having an unsupported document extension!", null, true, "true", "N/A", "false", "false", "false", null, "null");
            return true;
        }
        if (!z) {
            return true;
        }
        String group4 = matcher2.group(1);
        if (group4 == null || group4.isEmpty()) {
            String group5 = matcher2.group(2);
            if (group5 == null || group5.isEmpty()) {
                String group6 = matcher2.group(3);
                if (group6 == null || group6.isEmpty()) {
                    String group7 = matcher2.group(4);
                    if (group7 == null || group7.isEmpty()) {
                        String group8 = matcher2.group(5);
                        if (group8 == null || group8.isEmpty()) {
                            String group9 = matcher2.group(6);
                            if (group9 == null || group9.isEmpty()) {
                                String group10 = matcher2.group(7);
                                if (group10 == null || group10.isEmpty()) {
                                    str5 = "Discarded after matching to a domain with problems.";
                                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                                } else {
                                    str5 = "Discarded after matching to known domains with connectivity problems: '" + group10 + "'.";
                                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                                }
                            } else {
                                str5 = "Discarded after matching to a site having its DocUrls in larger depth: '" + group9 + "'.";
                                UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                                pagesWithLargerCrawlingDepth.incrementAndGet();
                            }
                        } else {
                            str5 = "Discarded after matching to a domain which needs login to access docFiles: '" + group8 + "'.";
                            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                            pagesRequireLoginToAccessDocFiles.incrementAndGet();
                        }
                    } else {
                        str5 = "Discarded after matching to a domain which doesn't provide docUrls: '" + group7 + "'.";
                        UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                        pagesNotProvidingDocUrls.incrementAndGet();
                    }
                } else {
                    str5 = "Discarded after matching to a site containing the full-text as plain-text inside its HTML: '" + group6 + "'.";
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                    pagesWithHtmlDocUrls.incrementAndGet();
                }
            } else {
                str5 = "Discarded after matching to the Results-directory: 'doaj.org/toc/': '" + group5 + "'.";
                UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
                doajResultPageUrls.incrementAndGet();
            }
        } else {
            str5 = "Discarded after matching to a JavaScript-using domain, other than the 'sciencedirect.com': '" + group4 + "'.";
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, str5, null, true, "true", "N/A", "false", "false", "false", null, "null");
            javascriptPageUrls.incrementAndGet();
        }
        logger.debug("Url-\"" + str3 + "\": " + str5);
        return true;
    }

    public static boolean shouldNotAcceptInternalLink(String str, String str2) {
        if (str2 == null) {
            str2 = str.toLowerCase();
        }
        return shouldNotAcceptPageUrl(null, null, str, str2, false) || INTERNAL_LINKS_KEYWORDS_FILTER.matcher(str2).matches() || INTERNAL_LINKS_FILE_FORMAT_FILTER.matcher(str2).matches() || PLAIN_PAGE_EXTENSION_FILTER.matcher(str2).matches();
    }
}
