/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.crawler.MetadataHandler;
import eu.openaire.publications_retriever.crawler.SpecialUrlsHandler;
import eu.openaire.publications_retriever.exceptions.ConnTimeoutException;
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkInvalidException;
import eu.openaire.publications_retriever.exceptions.DocLinkUnavailableException;
import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.exceptions.DomainWithUnsupportedHEADmethodException;
import eu.openaire.publications_retriever.exceptions.DynamicInternalLinksFoundException;
import eu.openaire.publications_retriever.machine_learning.PageStructureMLA;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileData;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.io.BufferedReader;
import java.net.HttpURLConnection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Strings;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PageCrawler {
    private static final Logger logger = LoggerFactory.getLogger(PageCrawler.class);

    /** Matched against the lower-cased internal link in checkAndGatherInternalLink(); a link that matches is dropped. */
    private static final Pattern INTERNAL_LINKS_STARTING_FROM_FILTER = Pattern.compile("^(?:(?:mailto|tel|fax|file|data|whatsapp|visible|click|text|attr):|\\{openurl}|[/]*\\?(?:locale(?:-attribute)?|ln)=).*");

    /** Matches a "javascript:pdflink(...)" link, case-insensitively; group 1 captures the quoted http(s) url. */
    public static final Pattern JAVASCRIPT_DOC_LINK = Pattern.compile("javascript:pdflink.*'(http.+)'[\\s]*,.*", Pattern.CASE_INSENSITIVE);

    /** Captures (group 1) the value of a "pdfUrl" json-like property. Not referenced in the visible part of this class. */
    public static final Pattern JAVASCRIPT_CODE_PDF_LINK = Pattern.compile(".*\"pdfUrl\":\"([^\"]+)\".*");

    /** Per-domain counter handed to ConnSupportUtils.countAndBlockDomainAfterTimes() when a page yields no internal links. */
    public static final ConcurrentHashMap<String, Integer> timesDomainNotGivingInternalLinks = new ConcurrentHashMap<>();

    /** Per-domain counter handed to ConnSupportUtils.countAndBlockDomainAfterTimes() when a page yields no target url. */
    public static final ConcurrentHashMap<String, Integer> timesDomainNotGivingDocUrls = new ConcurrentHashMap<>();

    /** Threshold used together with {@link #timesDomainNotGivingInternalLinks}. */
    public static final int timesToGiveNoInternalLinksBeforeBlocked = 200;

    /** Threshold used together with {@link #timesDomainNotGivingDocUrls}. */
    public static final int timesToGiveNoDocUrlsBeforeBlocked = 100;

    /** Incremented each time a page is discarded because of a problem with its content (see retrieveInternalLinks()). */
    public static AtomicInteger contentProblematicUrls = new AtomicInteger(0);

    /** Number of gathered internal links above which extractInternalLinksFromHtml() gives up on the page. */
    private static final int MAX_INTERNAL_LINKS_TO_ACCEPT_PAGE = 500;

    /** Number of doc/dataset-looking links above which visit() stops connecting and discards the page. */
    private static final int MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT = 5;

    /** When false, visit() does not pass the non-matching ("remaining") links to checkRemainingInternalLinks(). */
    public static boolean should_check_remaining_links = true;

    // NOTE(review): not referenced in the visible part of this class; presumably the cap applied by checkRemainingInternalLinks() - confirm.
    private static final int MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT = 10;

    /** Regex fragment matching any run of whitespace, "%20", dashes or underscores between two words. */
    public static final String spaceOrDashes = "(?:\\s|%20|-|_)*";

    /** Link text/title values which, on a full match of the lower-cased value, mark the link as a document link. */
    public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|file|attachment|download|t[\u00e9e]l[\u00e9e]charger|descargar|texte" + spaceOrDashes + "int[\u00e9e]gral");
    public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|(?:\\s|%20|-|_)*)?gu[i\u00ed](?:de|a)|directive[s]?|(?<!readonly_)preview|leaflet|agreement(?!.*thesis(?:\\s|%20|-|_)*(?:19|20)[\\d]{2}.*)|accessibility|journal(?:\\s|%20|-|_)*catalog|disclose(?:\\s|%20|-|_)*file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)|licen(?:se|cia)(?:\\s|%20|-|_)*(?:of|de)(?:\\s|%20|-|_)*us[eo]|(?:governance|safety)(?:\\s|%20|-|_)*statement|normativa|(?:consumer|hazard|copyright)(?:\\s|%20|-|_)*(?:information|(?:release(?:\\s|%20|-|_)*)?form)|copyright|permission|(?:editorial|review)(?:\\s|%20|-|_)*board|d[\u00e9e](?:p(?:\u00f4t[s]?|oser|osit(?!ed|s))|butez)|cr[\u00e9e]er(?:\\s|%20|-|_)*(?:votre|son)|orcid|subscription|instruction|code(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*conduct|[^_]request|join[^t]|compte|[^_]account|table(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*contents|(?:front|back|end)(?:\\s|%20|-|_)*matter|information(?:\\s|%20|-|_)*for(?:\\s|%20|-|_)*authors|pdf(?:/a)?(?:\\s|%20|-|_)*conversion|catalogue|factsheet|classifieds|pdf-viewer|certificate(?:\\s|%20|-|_)*of|conflict[s]?(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*interest|(?:recommendation|order)(?:\\s|%20|-|_)*form|adverti[sz]e|mandatory(?:\\s|%20|-|_)*open(?:\\s|%20|-|_)*access|recommandations(?:\\s|%20|-|_)*pour(?:\\s|%20|-|_)*s'affilier|hal.*collections|terms|conditions|hakuohjeet|logigramme|export_liste_publi|yearbook|pubs_(?:brochure|overview)|thermal-letter|r\u00e9utiliser(?:\\s|%20|-|_)*des(?:\\s|%20|-|_)*images(?:\\s|%20|-|_)*dans(?:\\s|%20|-|_)*des(?:\\s|%20|-|_)*publications|procedure|\u898f\u7a0b|\u904b\u55b6\u898f\u7a0b|(?:peer|mini)(?:\\s|%20|-|_)*review|(?:case|annual)(?:\\s|%20|-|_)*report|review(?:\\s|%20|-|_)*article|short(?:\\s|%20|-|_)*communication|letter(?:\\s|%20|-|_)*to(?:\\s|%20|-|_)*editor|how(?:\\s|%20|-|_)*to(?:\\s|%20|-|_)*(?:create|submit|contact)|tutori[ae]l|survey-results|calendar(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*events|know(?:\\s|%20|-|_)*your(?:\\s|%20|-|_)*rights|your(
?:(?:\\s|%20|-|_)*id|cv)(?:\\s|%20|-|_)*hal|pr\u00e9sentation(?:\\s|%20|-|_)*portail(?:\\s|%20|-|_)*hal|data-sharing-guidance|rate(?:(?:\\s|%20|-|_)*)?cards|press(?:\\s|%20|-|_)*release|liability(?:\\s|%20|-|_)*disclaimer|(?:avec|dans)(?:\\s|%20|-|_)*(?:ocd|x2)?hal|online(?:\\s|%20|-|_)*flyer|publishing(?:\\s|%20|-|_)*process|book(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*abstracts|academic(?:\\s|%20|-|_)*social(?:\\s|%20|-|_)*networks|ijcseugcjournalno|manuscript(?:(?:\\s|%20|-|_)*preparation)?(?:\\s|%20|-|_)*checklist|by(?:\\s|%20|-|_)*laws|reglamento(?:\\s|%20|-|_)*de(?:\\s|%20|-|_)*ciencia(?:\\s|%20|-|_)*abierta|^(?:licen[cs]e|help|reprints|pol[i\u00ed]ti[kc][sa](?:(?:\\s|%20|-|_)*de(?:\\s|%20|-|_)*informa\u00e7\u00e3o)?|for(?:\\s|%20|-|_)*recruiters|charte(?:\\s|%20|-|_)*de(?:\\s|%20|-|_)*signature|weekly(?:\\s|%20|-|_)*visitors|publication(?:\\s|%20|-|_)*(?:ethics(?:\\s|%20|-|_)*and(?:\\s|%20|-|_)*malpractice|fees)|redaktion|sample(?:\\s|%20|-|_)*manuscript|open(?:\\s|%20|-|_)*access(?:(?:\\s|%20|-|_)*policy)?)$|/(?:entry|information|opinion|(?:rapportannuel|publerkl|utt_so_|atsc_|tjg_|ictrp_|oproep_voor_artikels_|[^/]*call_for_contributions_)[\\w_()-]*|accesorestringido|library_recommendation_form|research-article|loi_republique_numerique_publis|nutzungsbedingungen|autorenhinweise|mediadaten|canceledpresentations|sscc-facme_cirugia|bir_journals_reprint_form|transparencia|wfme|evolution_de_l_ergonomie|que_pouvez_vous_deposer|ethic-comittee-approval|restri(?:ngido|cted)|ofi[c]+ial|asn(?:\\s|%20|-|_)*tips|aidehelp|.*_doi|(?:b-ent|aces)_.*).pdf(?:\\?.*)?$|kilavuzu|(?:\u516c\u8868|\u767b\u9332)\u5c4a\u51fa\u66f8|\u53d6\u6271\u8981\u9818|\u30ea\u30dd\u30b8\u30c8\u30ea(?:\u8981\u9805|\u904b\u7528\u6307\u91dd)|\u691c\u7d22\u306e\u30dd\u30a4\u30f3\u30c8|\u306b\u3064\u3044\u3066|\u95b2\u89a7\u65b9\u6cd5|\u30fc\u30d7\u30f3\u30a2\u30af\u30bb\u30b9\u30dd\u30ea\u30b7\u30fc|\u3055\u308c\u305f\u307f\u306a\u3055\u307e\u3078|(?:\u8ad6\u6587\u306e|\u767b\u9332)\u8a31\u8afe\u66f8|\u8457\
u4f5c\u6a29\u5229\u7528\u8a31\u8afe\u8981\u4ef6|\u524a\u9664\u4f9d\u983c\u66f8|\u30fc\u30d7\u30f3\u30a2\u30af\u30bb\u30b9\u65b9\u91dd|(?:\u520a\u884c\u7269|\u500b\u4eba)\u5358\u4f4d\u767b\u9332).*");
    /** Alternatives shared by the parent class-name filter and the parent id filter below. */
    private static final String commonPattern = "website-navigation|reference|su[m]{1,2}ar(?:io|y)(?!.*metadata.*)|author|logo|related(?:\\s|%20|-|_)*product";

    /** Matched against the lower-cased class name of each ancestor element in hasUnacceptableStructure(). */
    private static final Pattern PARENT_CLASS_NAME_FILTER_PATTERN = Pattern.compile("(?:^(?:tab|product-head-bnrs)$|.*(?:" + commonPattern + "|breadcrumb|su[b]?scri(?:p[tc]i[o\u00f3]n|b(?:a|ir)se)|reco[m]{1,2}enda(?:tion|do)|metric|stats|cookie|kapak|accesos-usuario).*)");

    /** Matched against the lower-cased id of each ancestor element in hasUnacceptableStructure(). */
    private static final Pattern PARENT_ID_FILTER_PATTERN = Pattern.compile(".*(?:" + commonPattern + "|other).*");

    // NOTE(review): the four members below are not referenced in the visible part of this class;
    // judging by their names they drive the evaluation of the "remaining links" check - confirm against checkRemainingInternalLinks().
    public static final int timesToCheckInternalLinksBeforeEvaluate = 20;
    public static final AtomicInteger timesCheckedRemainingLinks = new AtomicInteger(0);
    public static final AtomicInteger timesFoundDocOrDatasetUrlFromRemainingLinks = new AtomicInteger(0);
    private static final double leastPercentageOfHitsFromRemainingLinks = 0.2;

    /**
     * Crawls one HTML page, looking for a target (document and/or dataset) url among its internal links.
     * <p>
     * Processing order:
     * <ol>
     *   <li>Resolve the url-matcher and the domain of the page; on failure the page is written to the output as "unreachable"
     *       and the given reader is closed.</li>
     *   <li>If {@code ArgsUtils.shouldJustDownloadHtmlFiles} is set, only the HTML file is downloaded and logged.</li>
     *   <li>Read the HTML into a string and give the special-case handlers a chance (turkjgastroenterol.org, metadata).</li>
     *   <li>Gather the internal links; those matching the doc/dataset url-filters are connected to (at most
     *       {@code MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT} of them), the rest are kept as "remaining links".</li>
     *   <li>Hand the remaining links to {@code checkRemainingInternalLinks()} when that check is enabled, otherwise log the
     *       page as not providing a target url (unless one was already found).</li>
     * </ol>
     * The search stops at the first hit, unless both {@code ArgsUtils.retrieveDatasets} and
     * {@code ArgsUtils.shouldDetectAllDatasetLinks} are set.
     *
     * @param urlId the id associated with the source url
     * @param sourceUrl the url originally given as input
     * @param pageUrl the url of the page being crawled
     * @param pageContentType the content-type of the page; only used in output messages here
     * @param conn the open connection to the page
     * @param firstHTMLlineFromDetectedContentType passed through to the HTML-reading helpers
     * @param bufferedReader reader over the page's body; closed here on the early-exit paths, otherwise handed to
     *                       {@code ConnSupportUtils.getHtmlString()}
     */
    public static void visit(String urlId, String sourceUrl, String pageUrl, String pageContentType, HttpURLConnection conn, String firstHTMLlineFromDetectedContentType, BufferedReader bufferedReader) {
        logger.debug("Visiting pageUrl: \"" + pageUrl + "\", from id: \"" + urlId + "\".");
        Matcher urlMatcher = UrlUtils.getUrlMatcher(pageUrl);
        if (urlMatcher == null) {
            logger.warn("Problematic URL in \"PageCrawler.visit()\": \"" + pageUrl + "\"");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in PageCrawler.visit() method, after the occurrence of a urlMatcher error.", "null", null, true, "true", "false", "false", "false", "false", null, "null", "null");
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            ConnSupportUtils.closeBufferedReader(bufferedReader);
            return;
        }
        String pageDomain = UrlUtils.getDomainStr(pageUrl, urlMatcher);
        if (pageDomain == null) {
            logger.warn("Problematic URL in \"PageCrawler.visit()\": \"" + pageUrl + "\"");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in PageCrawler.visit() method, after the occurrence of a domain-retrieval error.", "null", null, true, "true", "false", "false", "false", "false", null, "null", "null");
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            ConnSupportUtils.closeBufferedReader(bufferedReader);
            return;
        }

        // "Download only" mode: the reader is released first, then the HTML is stored through the connection itself.
        if (ArgsUtils.shouldJustDownloadHtmlFiles) {
            ConnSupportUtils.closeBufferedReader(bufferedReader);
            FileData htmlFileData = ConnSupportUtils.downloadHtmlFile(conn, urlId, pageUrl, false, urlMatcher, firstHTMLlineFromDetectedContentType);
            if (htmlFileData == null) {
                logger.warn("Could not retrieve the HTML-code for pageUrl: " + pageUrl);
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + pageContentType + "'.", "null", null, true, "true", "true", "false", "false", "true", null, "null", "null");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
            } else {
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "null", "null", htmlFileData.getLocation(), null, true, "true", "true", "null", "null", "true", htmlFileData.getSize(), htmlFileData.getHash(), "text/html");
            }
            return;
        }

        String pageHtml = ConnSupportUtils.getHtmlString(conn, pageUrl, bufferedReader, false, firstHTMLlineFromDetectedContentType);
        if (pageHtml == null) {
            logger.warn("Could not retrieve the HTML-code for pageUrl: " + pageUrl);
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + pageContentType + "'.", "null", null, true, "true", "true", "false", "false", "true", null, "null", "null");
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            return;
        }

        // This domain has its own dedicated extraction routine.
        if (ArgsUtils.retrieveDocuments && pageDomain.contains("turkjgastroenterol.org")) {
            SpecialUrlsHandler.extractAndCheckTurkjgastroenterolDocUrl(pageHtml, urlId, sourceUrl, pageUrl, pageDomain);
            return;
        }

        boolean shouldContinueSearchingForDatasets = ArgsUtils.retrieveDatasets && ArgsUtils.shouldDetectAllDatasetLinks;
        boolean atLeastOneDocOrDatasetLinkFound = false;
        if (MetadataHandler.checkAndHandleMetadata(urlId, sourceUrl, pageUrl, pageDomain, pageHtml)) {
            if (!shouldContinueSearchingForDatasets) {
                return;
            }
            atLeastOneDocOrDatasetLinkFound = true;
        }

        // A null result means the page was already dealt with inside "retrieveInternalLinks()".
        HashMap<String, String> pageLinksWithStructure = PageCrawler.retrieveInternalLinks(urlId, sourceUrl, pageUrl, pageDomain, pageHtml, pageContentType);
        if (pageLinksWithStructure == null) {
            return;
        }

        HashMap<String, String> remainingLinks = new HashMap<String, String>(pageLinksWithStructure.size());
        int possibleDocOrDatasetUrlsCounter = 0;
        for (Map.Entry<String, String> currentEntry : pageLinksWithStructure.entrySet()) {
            String currentLink = currentEntry.getKey();
            String urlToCheck = ConnSupportUtils.getFullyFormedUrl(pageUrl, currentLink, null);
            if (urlToCheck == null || (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(urlToCheck)) == null) {
                logger.warn("Could not normalize internal url: " + currentLink);
                continue;
            }

            // The url is already known as a result-url.
            IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.resultUrlsWithIDs.get(urlToCheck);
            if (originalIdUrlMimeTypeTriple != null) {
                if (shouldContinueSearchingForDatasets && urlId.equals(originalIdUrlMimeTypeTriple.id) && sourceUrl.equals(originalIdUrlMimeTypeTriple.url)) {
                    continue;   // It was found earlier for this very id and source-url; do not handle it twice.
                }
                ConnSupportUtils.handleReCrossedTargetUrl(urlId, sourceUrl, pageUrl, urlToCheck, originalIdUrlMimeTypeTriple, false);
                PageStructureMLA.addStructureOfDocUrlInMap(pageUrl, currentEntry.getValue());
                if (!shouldContinueSearchingForDatasets) {
                    return;
                }
                atLeastOneDocOrDatasetLinkFound = true;
                continue;
            }

            String lowerCaseLink = urlToCheck.toLowerCase();
            boolean looksLikeTargetUrl = (ArgsUtils.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseLink).matches())
                    || (ArgsUtils.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseLink).matches());
            if (!looksLikeTargetUrl) {
                remainingLinks.put(urlToCheck, currentEntry.getValue());
                continue;
            }

            if (UrlUtils.duplicateUrls.contains(urlToCheck)) {
                continue;
            }
            if (UrlTypeChecker.shouldNotAcceptInternalLink(urlToCheck, lowerCaseLink)) {
                UrlUtils.duplicateUrls.add(urlToCheck);
                continue;
            }
            if (++possibleDocOrDatasetUrlsCounter > MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT) {
                logger.warn("The maximum limit (" + MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT + ") of possible doc or dataset links to be connected was reached for pageUrl: \"" + pageUrl + "\". The page was discarded.");
                if (!atLeastOneDocOrDatasetLinkFound) {
                    PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
                }
                return;
            }

            try {
                if (!HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, urlToCheck, null, false, true)) {
                    UrlUtils.duplicateUrls.add(urlToCheck);    // Remember the miss, so the url is not connected to again.
                    continue;
                }
                PageStructureMLA.addStructureOfDocUrlInMap(pageUrl, currentEntry.getValue());
                if (!shouldContinueSearchingForDatasets) {
                    return;
                }
                atLeastOneDocOrDatasetLinkFound = true;
            } catch (RuntimeException re) {
                UrlUtils.duplicateUrls.add(urlToCheck);
            } catch (DomainBlockedException dbe) {
                // Only give up on the page when it is the page's own domain that got blocked.
                String blockedDomain = dbe.getMessage();
                if (blockedDomain == null || !blockedDomain.contains(pageDomain)) {
                    continue;
                }
                logger.warn("Page: \"" + pageUrl + "\" left \"PageCrawler.visit()\" after its domain was blocked.");
                String couldRetry = LoaderAndChecker.COULD_RETRY_URLS.matcher(pageUrl).matches() ? "true" : "false";
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.visit()' method, as its domain was blocked during crawling.", "null", null, true, "true", "true", "false", "false", couldRetry, null, "null", "null");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                return;
            } catch (ConnTimeoutException cte) {
                // Only give up on the page when the url which timed out contains the page's domain.
                if (!urlToCheck.contains(pageDomain)) {
                    continue;
                }
                logger.warn("Page: \"" + pageUrl + "\" left \"PageCrawler.visit()\" after a potentialDocUrl caused a ConnTimeoutException.");
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.visit()' method, as an internalLink of this page caused 'ConnTimeoutException'.", "null", null, true, "true", "true", "false", "false", "true", null, "null", "null");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                return;
            } catch (Exception e) {
                logger.error("Error when processing the url: " + urlToCheck, e);
            }
        }

        if (should_check_remaining_links && !remainingLinks.isEmpty()) {
            PageCrawler.checkRemainingInternalLinks(urlId, sourceUrl, pageUrl, pageDomain, remainingLinks, atLeastOneDocOrDatasetLinkFound);
        } else if (!atLeastOneDocOrDatasetLinkFound) {
            PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
        }
    }

    /**
     * Does the bookkeeping for a page in which no target url was found: optional warning, counter increment,
     * optional output record and the per-domain "no target url" count which may lead to the domain being blocked.
     *
     * @param urlId the id associated with the source url
     * @param sourceUrl the url originally given as input
     * @param pageUrl the url of the page which gave no target url
     * @param pageDomain the domain of that page; it is the key of the per-domain counter
     * @param hasWarningLogBeenShown when true, the "does not contain" warning is not logged again
     * @param isAlreadyLoggedToOutput when true, no output record is written (the caller already wrote one)
     */
    private static void handlePageWithNoDocOrDatasetUrls(String urlId, String sourceUrl, String pageUrl, String pageDomain, boolean hasWarningLogBeenShown, boolean isAlreadyLoggedToOutput) {
        if (!hasWarningLogBeenShown) {
            logger.warn("Page: \"" + pageUrl + "\" does not contain a " + ArgsUtils.targetUrlType + ".");
        }
        UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet();
        if (!isAlreadyLoggedToOutput) {
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.visit()' method, as no " + ArgsUtils.targetUrlType + " was found inside.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
        }
        boolean domainGotBlocked = ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, timesDomainNotGivingDocUrls, pageDomain, timesToGiveNoDocUrlsBeforeBlocked, true);
        if (domainGotBlocked) {
            logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no " + ArgsUtils.targetUrlType + " more than " + timesToGiveNoDocUrlsBeforeBlocked + " times.");
        }
    }

    /**
     * Extracts the internal links of a page and translates every failure signalled by
     * {@link #extractInternalLinksFromHtml} into the matching log message, output record and counters.
     * <p>
     * The extraction uses exceptions to report special outcomes: a doc-link spotted directly in the page
     * ({@code DocLinkFoundException}, which is verified right here), an invalid or unavailable doc-link,
     * dynamic links (the page's domain gets blacklisted) or a generic failure.
     *
     * @param urlId the id associated with the source url
     * @param sourceUrl the url originally given as input
     * @param pageUrl the url of the page the HTML belongs to
     * @param pageDomain the domain of the page
     * @param pageHtml the HTML code of the page
     * @param pageContentType the content-type of the page; only used in messages
     * @return the links mapped to their page-structure, or {@code null} when the caller has nothing more to do for this page
     */
    public static HashMap<String, String> retrieveInternalLinks(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml, String pageContentType) {
        HashMap<String, String> pageLinksWithStructure;
        try {
            pageLinksWithStructure = PageCrawler.extractInternalLinksFromHtml(urlId, pageHtml, pageUrl);
        } catch (RuntimeException re) {
            String exceptionMessage = re.getMessage();
            if (exceptionMessage == null) {
                exceptionMessage = "No reason was given!";
            }
            logger.warn(exceptionMessage + " This page was discarded.");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.retrieveInternalLinks()' method, with reason: " + exceptionMessage, "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            contentProblematicUrls.incrementAndGet();
            return null;
        } catch (DynamicInternalLinksFoundException dilfe) {
            HttpConnUtils.blacklistedDomains.add(pageDomain);
            logger.warn("Page: \"" + pageUrl + "\" left \"PageCrawler.visit()\" after found to have dynamic links. Its domain \"" + pageDomain + "\"  was blocked.");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.retrieveInternalLinks()', as it belongs to a domain with dynamic-links.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            contentProblematicUrls.incrementAndGet();
            return null;
        } catch (DocLinkFoundException dlfe) {
            // The extraction spotted a doc-link by itself; verify it here. Either way, there is nothing left for the caller.
            if (!PageCrawler.verifyDocLink(urlId, sourceUrl, pageUrl, pageContentType, dlfe)) {
                PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
            }
            return null;
        } catch (DocLinkInvalidException dlie) {
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as there was an invalid docLink. Its contentType is: '" + pageContentType + "'", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
            return null;
        } catch (DocLinkUnavailableException dlue) {
            logger.warn("The docLink was not available inside pageUrl: " + pageUrl);
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as the doc-link was not available. Its contentType is: '" + pageContentType + "'", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            contentProblematicUrls.incrementAndGet();
            return null;
        } catch (Exception e) {
            // Safety net; the exception is attached to the log so that its cause is not lost.
            logger.warn("Could not retrieve the internalLinks for pageUrl: " + pageUrl, e);
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as there was a problem retrieving its internalLinks. Its contentType is: '" + pageContentType + "'", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            contentProblematicUrls.incrementAndGet();
            return null;
        }

        if (pageLinksWithStructure != null && !pageLinksWithStructure.isEmpty()) {
            return pageLinksWithStructure;
        }

        // null: the page had no link-elements at all; empty: it had some, but none of them survived the filtering.
        boolean isEmpty = pageLinksWithStructure != null;
        logger.warn("No " + (isEmpty ? "valid" : "available") + " links were able to be retrieved from pageUrl: \"" + pageUrl + "\". Its contentType is: " + pageContentType);
        contentProblematicUrls.incrementAndGet();
        UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in PageCrawler.retrieveInternalLinks() method, as no " + (isEmpty ? "valid " : "") + "links were able to be retrieved from it. Its contentType is: '" + pageContentType + "'", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
        if (ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, timesDomainNotGivingInternalLinks, pageDomain, timesToGiveNoInternalLinksBeforeBlocked, true)) {
            logger.warn("Domain: \"" + pageDomain + "\" was blocked after not providing internalLinks more than " + timesToGiveNoInternalLinksBeforeBlocked + " times.");
        }
        return null;
    }

    /**
     * Parses the HTML of a page and gathers its acceptable internal links, each mapped to the tag-and-class
     * structure of the element it came from.
     * <p>
     * Only {@code a}, {@code link[href][type*=pdf]} and {@code form[action]} elements are examined. Elements with an
     * unacceptable structure are skipped. When documents are retrieved, the text, title and type of each element are
     * checked first, which may end the extraction early through one of the "DocLink" exceptions.
     *
     * @param urlId the id associated with the source url (not used inside this method)
     * @param pageHtml the HTML code of the page
     * @param pageUrl the url of the page
     * @return the gathered links (possibly empty), or {@code null} when the page has none of the examined elements
     * @throws DocLinkFoundException when an element is identified as pointing to the document
     * @throws DocLinkInvalidException when such an element is found, but its link is empty, "#" or not acceptable
     * @throws DynamicInternalLinksFoundException when a link contains template markers (see checkAndGatherInternalLink())
     * @throws DocLinkUnavailableException not thrown directly here; NOTE(review): presumably raised by one of the called handlers - confirm
     * @throws RuntimeException when more than {@code MAX_INTERNAL_LINKS_TO_ACCEPT_PAGE} links are gathered
     */
    public static HashMap<String, String> extractInternalLinksFromHtml(String urlId, String pageHtml, String pageUrl) throws DocLinkFoundException, DynamicInternalLinksFoundException, DocLinkInvalidException, DocLinkUnavailableException, RuntimeException {
        Document document = Jsoup.parse(pageHtml);
        Elements elementLinksOnPage = document.select("a, link[href][type*=pdf], form[action]");
        if (elementLinksOnPage.isEmpty()) {
            return null;
        }
        // NOTE(review): a load factor of 10.0f is unusually high for a HashMap - confirm that it is intended.
        HashMap<String, String> linksWithStructure = new HashMap<String, String>(elementLinksOnPage.size() / 2, 10.0f);
        int curNumOfInternalLinks = 0;
        if (pageUrl.contains("aup-online.com")) {
            SpecialUrlsHandler.handleAupOnlinePage(pageUrl, elementLinksOnPage);
        }
        PageStructureMLA.predictDocOrDatasetLink(pageUrl, elementLinksOnPage);

        for (Element el : elementLinksOnPage) {
            if (PageCrawler.hasUnacceptableStructure(el, pageUrl)) {
                continue;
            }
            if (ArgsUtils.retrieveDocuments) {
                // A "true" result means the element should be skipped; a doc-link is reported through an exception.
                String linkText = el.text().trim();
                if (!linkText.isEmpty() && PageCrawler.checkTextOrTitleAlongWithLink(el, linkText)) {
                    continue;
                }
                String linkTitle = el.attr("title").trim();
                if (!linkTitle.isEmpty() && PageCrawler.checkTextOrTitleAlongWithLink(el, linkTitle)) {
                    continue;
                }
                // An element which declares a known document mime-type is taken as the doc-link right away.
                String linkType = el.attr("type").trim();
                if (!linkType.isEmpty() && ConnSupportUtils.knownDocMimeTypes.contains(linkType)) {
                    String typedLink = el.attr("href").trim();
                    if (typedLink.isEmpty() || typedLink.equals("#")) {
                        throw new DocLinkInvalidException(typedLink);
                    }
                    String tempInternalLink = ConnSupportUtils.getFullyFormedUrl(pageUrl, typedLink, null);
                    if (tempInternalLink != null && UrlTypeChecker.shouldNotAcceptInternalLink(tempInternalLink, null)) {
                        throw new DocLinkInvalidException(typedLink);
                    }
                    typedLink = Strings.CS.replace(typedLink, "/view/", "/download/", 1);
                    throw new DocLinkFoundException(typedLink, PageStructureMLA.getPageTagAndClassStructureForElement(el), false);
                }
            }

            String internalLink = PageCrawler.getInternalLink(pageUrl, el);
            if (internalLink == null) {
                continue;
            }
            internalLink = PageCrawler.checkAndGatherInternalLink(internalLink, el);
            if (internalLink == null) {
                continue;
            }
            if (++curNumOfInternalLinks > MAX_INTERNAL_LINKS_TO_ACCEPT_PAGE) {
                throw new RuntimeException("Avoid checking more than " + MAX_INTERNAL_LINKS_TO_ACCEPT_PAGE + " internal links which were found in pageUrl \"" + pageUrl + "\".");
            }
            linksWithStructure.put(internalLink, PageStructureMLA.getPageTagAndClassStructureForElement(el));
        }
        return linksWithStructure;
    }

    /**
     * Picks the link carried by an element: its {@code href}, else the value of one of its "data" attributes,
     * else its form {@code action}.
     * <p>
     * A usable {@code action} is never returned; it is reported through an exception, as either the doc-link or an invalid one.
     * The {@code action} is considered only when it matches the url-filter of at least one enabled target type
     * (documents and/or datasets). This is the same "either filter" rule applied in {@code visit()} and
     * {@code checkAndGatherInternalLink()}, so with both types enabled an action matching only one filter is no longer dropped.
     *
     * @param pageUrl the url of the page, used for forming the full url of the action
     * @param el the element to take the link from
     * @return the href or data-attribute link, or {@code null} when the element offers no usable link
     * @throws DocLinkFoundException when the form-action is accepted as the target link
     * @throws DocLinkInvalidException when the form-action matches a filter, but its full url should not be accepted
     */
    public static String getInternalLink(String pageUrl, Element el) throws DocLinkFoundException, DocLinkInvalidException {
        String internalLink = el.attr("href").trim();
        if (!internalLink.isEmpty() && !internalLink.equals("#")) {
            return internalLink;
        }
        internalLink = PageCrawler.getInternalDataLink(el);
        if (internalLink != null) {
            return internalLink;
        }

        // Last resort: the "action" of a form.
        String actionLink = el.attr("action").trim();
        if (actionLink.isEmpty() || actionLink.equals("#")) {
            return null;
        }
        String lowerCaseActionLink = actionLink.toLowerCase();
        boolean anyTargetTypeEnabled = ArgsUtils.retrieveDocuments || ArgsUtils.retrieveDatasets;
        boolean matchesAnEnabledFilter = (ArgsUtils.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseActionLink).matches())
                || (ArgsUtils.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseActionLink).matches());
        if (anyTargetTypeEnabled && !matchesAnEnabledFilter) {
            return null;
        }

        String tempInternalLink = ConnSupportUtils.getFullyFormedUrl(pageUrl, actionLink, null);
        if (tempInternalLink != null && UrlTypeChecker.shouldNotAcceptInternalLink(tempInternalLink, null)) {
            throw new DocLinkInvalidException(actionLink);
        }
        throw new DocLinkFoundException(actionLink, PageStructureMLA.getPageTagAndClassStructureForElement(el), false);
    }

    /**
     * Judges an element by its text or title.
     * <p>
     * If the lower-cased value matches {@code NON_VALID_DOCUMENT}, the element is to be skipped. If it fully matches
     * {@code DOCUMENT_TEXT}, the link of the element (its href, else one of its "data" attributes) is treated as the
     * doc-link and reported through an exception, unless no link is available or it is a "javascript:" one.
     *
     * @param el the element under examination
     * @param linkAttr the non-empty text or title of the element
     * @return {@code true} when the element should be skipped, {@code false} when the value says nothing about it
     * @throws DocLinkFoundException when the element's link is accepted as the doc-link
     * @throws DocLinkInvalidException when the element's link should not be accepted
     */
    private static boolean checkTextOrTitleAlongWithLink(Element el, String linkAttr) throws DocLinkFoundException, DocLinkInvalidException {
        String loweredAttr = linkAttr.toLowerCase();
        if (NON_VALID_DOCUMENT.matcher(loweredAttr).matches()) {
            return true;
        }
        if (!DOCUMENT_TEXT.matcher(loweredAttr).matches()) {
            return false;
        }

        // The text/title announces a document; locate the link which goes with it.
        String candidateLink = el.attr("href").trim();
        if (candidateLink.isEmpty() || candidateLink.equals("#")) {
            candidateLink = PageCrawler.getInternalDataLink(el);
            if (candidateLink == null) {
                return true;
            }
        }
        if (candidateLink.startsWith("javascript:")) {
            return true;
        }
        if (UrlTypeChecker.shouldNotAcceptInternalLink(candidateLink, null)) {
            throw new DocLinkInvalidException(candidateLink);
        }
        candidateLink = Strings.CS.replace(candidateLink, "/view/", "/download/", 1);
        throw new DocLinkFoundException(candidateLink, PageStructureMLA.getPageTagAndClassStructureForElement(el), false);
    }

    /**
     * Searches the attributes of an element for a link stored in a "data" attribute.
     * <p>
     * An attribute qualifies when its name contains "data" but not "data-follow-set", and its trimmed value is
     * neither empty nor "#". The value of the first qualifying attribute is returned.
     * <p>
     * A rejected value (empty or "#") is never returned; the callers rely on {@code null} to detect that no link is available.
     *
     * @param element the element whose attributes are searched
     * @return the trimmed value of the first qualifying attribute, or {@code null} when there is none
     */
    private static String getInternalDataLink(Element element) {
        for (Attribute attribute : element.attributes().asList()) {
            String name = attribute.getKey();
            if (!name.contains("data") || name.contains("data-follow-set")) {
                continue;
            }
            String value = attribute.getValue().trim();
            if (!value.isEmpty() && !value.equals("#")) {
                return value;
            }
        }
        return null;
    }

    /**
     * Decides whether a link-element sits in a part of the page which should not be searched.
     * <p>
     * The element is rejected when any of the following holds:
     * <ul>
     *   <li>its class is exactly "state-published" and its absolute href has a domain not contained in the page-url,</li>
     *   <li>the own-text of its direct parent matches {@code NON_VALID_DOCUMENT},</li>
     *   <li>any ancestor is a {@code footer} or {@code header} tag, or has a class-name or id matching the parent filters.</li>
     * </ul>
     *
     * @param element the link-element to examine
     * @param pageUrl the url of the page containing the element
     * @return {@code true} when the element should be skipped
     */
    private static boolean hasUnacceptableStructure(Element element, String pageUrl) {
        if (element.className().trim().equals("state-published")) {
            String href = element.attr("href").trim();
            if (href.startsWith("http")) {
                String hrefDomain = UrlUtils.getDomainStr(href, null);
                if (hrefDomain != null && !pageUrl.contains(hrefDomain)) {
                    return true;
                }
            }
        }

        Element ancestor = element.parent();
        if (ancestor == null) {
            return false;
        }

        // Only the text of the direct parent is checked; the loop below looks at tags, classes and ids.
        String directParentText = ancestor.ownText().trim().toLowerCase();
        if (!directParentText.isEmpty() && NON_VALID_DOCUMENT.matcher(directParentText).matches()) {
            return true;
        }

        // Walk up the tree, from the direct parent to the root.
        for (; ancestor != null; ancestor = ancestor.parent()) {
            String tag = ancestor.tagName().trim();
            if (tag.equals("footer") || tag.equals("header")) {
                return true;
            }
            String className = ancestor.className().trim();
            if (!className.isEmpty() && PARENT_CLASS_NAME_FILTER_PATTERN.matcher(className.toLowerCase()).matches()) {
                return true;
            }
            String id = ancestor.id();
            if (!id.isEmpty() && PARENT_ID_FILTER_PATTERN.matcher(id.toLowerCase()).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Filters a raw internal link and returns the form in which it should be gathered.
     * <p>
     * Dropped (result {@code null}): the bare "/", links matching {@code INTERNAL_LINKS_STARTING_FROM_FILTER}, links with an
     * anchor which neither match an enabled doc/dataset url-filter nor contain "/#/", anchor-less links containing a
     * double-quote or "[error", and "javascript:" links which do not match {@code JAVASCRIPT_DOC_LINK}.
     *
     * @param internalLink the link as taken from the element
     * @param el the element the link came from; only used for the page-structure attached to a found doc-link
     * @return the link to gather (with its anchor removed, when it matched a doc/dataset filter), or {@code null} to drop it
     * @throws DynamicInternalLinksFoundException when the link contains "{{" or "&lt;?"
     * @throws DocLinkFoundException when a "javascript:" link matches {@code JAVASCRIPT_DOC_LINK}; it carries the captured url
     */
    public static String checkAndGatherInternalLink(String internalLink, Element el) throws DynamicInternalLinksFoundException, DocLinkFoundException {
        if (internalLink.equals("/")) {
            return null;
        }
        boolean hasTemplateMarker = internalLink.contains("{{") || internalLink.contains("<?");
        if (hasTemplateMarker) {
            throw new DynamicInternalLinksFoundException();
        }

        String loweredLink = internalLink.toLowerCase();
        if (INTERNAL_LINKS_STARTING_FROM_FILTER.matcher(loweredLink).matches()) {
            return null;
        }

        if (loweredLink.contains("#")) {
            boolean matchesTargetFilter = (ArgsUtils.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(loweredLink).matches())
                    || (ArgsUtils.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(loweredLink).matches());
            if (matchesTargetFilter) {
                return UrlUtils.removeAnchor(internalLink);
            }
            // Apart from the above, an anchor is tolerated only as part of a "/#/" path.
            if (!loweredLink.contains("/#/")) {
                return null;
            }
        } else if (loweredLink.contains("\"") || loweredLink.contains("[error")) {
            return null;
        }

        if (!loweredLink.startsWith("javascript:")) {
            return internalLink;
        }

        // The only "javascript:" links of interest are the ones carrying a pdf-link.
        Matcher pdfLinkMatcher = JAVASCRIPT_DOC_LINK.matcher(internalLink);
        if (!pdfLinkMatcher.matches()) {
            return null;
        }
        String pdfLink = null;
        try {
            pdfLink = pdfLinkMatcher.group(1);
        } catch (Exception e) {
            logger.error("", e);
        }
        throw new DocLinkFoundException(pdfLink, PageStructureMLA.getPageTagAndClassStructureForElement(el), false);
    }

    /**
     * Verifies the link carried by the given {@link DocLinkFoundException}.
     * <p>
     * The link (the message of the exception) is turned into a fully-formed url and normalized. If the resulting url
     * is already present in {@code UrlUtils.resultUrlsWithIDs}, it is accepted without connecting to it.
     * Otherwise, it is checked through {@code HttpConnUtils.connectAndCheckMimeType}.
     * On every failure an "unreachable" record is written through {@code UrlUtils.addOutputData}.
     *
     * @param urlId the id of the record being processed
     * @param sourceUrl the url from which the processing of this record started
     * @param pageUrl the url of the page in which the link was found
     * @param pageContentType the content-type of the page; only used inside the output-comment
     * @param dlfe the exception which carries the link and the page-structure of its element
     * @return {@code true} if the link was accepted, {@code false} otherwise
     */
    public static boolean verifyDocLink(String urlId, String sourceUrl, String pageUrl, String pageContentType, DocLinkFoundException dlfe) {
        final String rawDocLink = dlfe.getMessage();
        if (rawDocLink == null || rawDocLink.isEmpty()) {
            logger.warn("DocLink was not retrieved!");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its internalLinks. Its contentType is: '" + pageContentType + "'", "null", null, true, "true", "true", "false", "false", "true", null, "null", "null");
            return false;
        }

        // Make the link absolute first and normalize it afterwards; a null at either step means failure.
        String docLink = ConnSupportUtils.getFullyFormedUrl(pageUrl, rawDocLink, null);
        if (docLink != null) {
            docLink = LoaderAndChecker.basicURLNormalizer.filter(docLink);
        }
        if (docLink == null) {
            logger.warn("Could not normalize internal url: " + rawDocLink);
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.visit()' method, as there were normalization problems with the 'possibleDocUrl' found inside: " + rawDocLink, "null", null, true, "true", "false", "false", "false", "false", null, "null", "null");
            return false;
        }

        // The url may already be known from a previous result.
        final IdUrlMimeTypeTriple knownTriple = UrlUtils.resultUrlsWithIDs.get(docLink);
        if (knownTriple != null) {
            if (urlId.equals(knownTriple.id) && sourceUrl.equals(knownTriple.url)) {
                logger.warn("A second targetUrl was found for the same id (" + urlId + ") and sourceUrl(" + sourceUrl + ")!");
            } else {
                ConnSupportUtils.handleReCrossedTargetUrl(urlId, sourceUrl, pageUrl, docLink, knownTriple, false);
                PageStructureMLA.addStructureOfDocUrlInMap(pageUrl, dlfe.getPageTagAndClassStructureForElement());
                if (dlfe.isPredictedByStructureMLA()) {
                    PageStructureMLA.structureValidatedDocLinks.incrementAndGet();
                }
            }
            return true;
        }

        try {
            if (HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, docLink, null, false, true)) {
                PageStructureMLA.addStructureOfDocUrlInMap(pageUrl, dlfe.getPageTagAndClassStructureForElement());
                if (dlfe.isPredictedByStructureMLA()) {
                    PageStructureMLA.structureValidatedDocLinks.incrementAndGet();
                }
                return true;
            }
            logger.warn("The DocLink < " + docLink + " > was not a " + ArgsUtils.targetUrlType + " (unexpected)!");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + docLink + " > was not a docUrl.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            return false;
        } catch (Exception e) {
            // Any exception raised inside the block above is reported as a connectivity problem.
            logger.warn("The DocLink < " + docLink + " > was not reached!");
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + docLink + " > had connectivity problems.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            return false;
        }
    }

    /**
     * Connects to the remaining internal links of a page, one by one, looking for a doc or dataset url.
     * <p>
     * Before any link is checked, the global hit-rate of this method is evaluated: once it has been called at least
     * 20 times and less than 0.2% of those calls found a url, {@code should_check_remaining_links} is set to
     * {@code false} and the method returns immediately.
     * <p>
     * Only links which contain the {@code pageDomain} and are not in {@code UrlUtils.duplicateUrls} are considered,
     * and at most 10 of them are connected to. Links which are rejected by
     * {@code UrlTypeChecker.shouldNotAcceptInternalLink} or which turn out not to be of the wanted type
     * are added to {@code UrlUtils.duplicateUrls}.
     *
     * @param urlId the id of the record being processed
     * @param sourceUrl the url from which the processing of this record started
     * @param pageUrl the url of the page whose links are checked
     * @param pageDomain the domain of the page
     * @param remainingLinks the links to check, mapped to the page-structure value which is passed to
     *                       {@code PageStructureMLA.addStructureOfDocUrlInMap} on success
     * @param atLeastOneDocOrDatasetLinkFound when {@code false}, the page is passed to
     *                       {@code handlePageWithNoDocOrDatasetUrls} in case nothing is found here
     * @return {@code true} only if one of the remaining links was verified by
     *         {@code HttpConnUtils.connectAndCheckMimeType}, {@code false} in any other case
     */
    public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl, String pageUrl, String pageDomain, HashMap<String, String> remainingLinks, boolean atLeastOneDocOrDatasetLinkFound) {
        int timesChecked = timesCheckedRemainingLinks.incrementAndGet();
        if (timesChecked >= 20) {
            double percentage = (double) timesFoundDocOrDatasetUrlFromRemainingLinks.get() * 100.0 / (double) timesChecked;
            if (percentage < 0.2) {
                logger.warn("The percentage of found docUrls from the remaining links is too low ( " + percentage + "% ). Stop checking the remaining-internalLinks for any pageUrl..");
                should_check_remaining_links = false;
                if (!atLeastOneDocOrDatasetLinkFound) {
                    PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
                }
                return false;
            }
        }

        int remainingUrlsCounter = 0;
        for (Map.Entry<String, String> currentEntry : remainingLinks.entrySet()) {
            String currentLink = currentEntry.getKey();
            // From this point on, the currentLink is guaranteed to contain the pageDomain.
            if (!currentLink.contains(pageDomain) || UrlUtils.duplicateUrls.contains(currentLink)) {
                continue;
            }
            if (UrlTypeChecker.shouldNotAcceptInternalLink(currentLink, null)) {
                UrlUtils.duplicateUrls.add(currentLink);
                continue;
            }
            if (++remainingUrlsCounter > 10) {
                logger.warn("The maximum limit (10) of remaining links to be connected was reached for pageUrl: \"" + pageUrl + "\". The page was discarded.");
                if (!atLeastOneDocOrDatasetLinkFound) {
                    PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
                }
                return false;
            }
            try {
                if (HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, currentLink, null, false, false)) {
                    timesFoundDocOrDatasetUrlFromRemainingLinks.incrementAndGet();
                    PageStructureMLA.addStructureOfDocUrlInMap(pageUrl, currentEntry.getValue());
                    return true;
                }
                // Not of the wanted type: remember it, so that it is skipped from now on.
                UrlUtils.duplicateUrls.add(currentLink);
            } catch (DomainBlockedException dbe) {
                String blockedDomain = dbe.getMessage();
                if (blockedDomain == null || !blockedDomain.contains(pageDomain)) {
                    continue;   // The blocked domain is not the one of this page; try the next link.
                }
                logger.warn("Page: \"" + pageUrl + "\" left \"PageCrawler.checkRemainingInternalLinks()\" after its domain was blocked.");
                String couldRetry = LoaderAndChecker.COULD_RETRY_URLS.matcher(pageUrl).matches() ? "true" : "false";
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.checkRemainingInternalLinks()' method, as its domain was blocked during crawling.", "null", null, true, "true", "true", "false", "false", couldRetry, null, "null", "null");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                return false;
            } catch (DomainWithUnsupportedHEADmethodException dwuhe) {
                // No domain-check is needed here: the filter at the top of the loop already ensured that the currentLink contains the pageDomain.
                logger.warn("Page: \"" + pageUrl + "\" left \"PageCrawler.checkRemainingInternalLinks()\" after its domain was caught to not support the HTTP HEAD method, as a result, the internal-links will stop being checked.");
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.checkRemainingInternalLinks()' method, as its domain was caught to not support the HTTP HEAD method.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                return false;
            } catch (ConnTimeoutException cte) {
                // No domain-check is needed here either, for the same reason as above.
                logger.warn("Page: \"" + pageUrl + "\" left \"PageCrawler.checkRemainingInternalLinks()\" after an internalLink caused a ConnTimeoutException.");
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Logged in 'PageCrawler.checkRemainingInternalLinks()' method, as an internalLink of this page caused 'ConnTimeoutException'.", "null", null, true, "true", "true", "false", "false", "true", null, "null", "null");
                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                return false;
            } catch (RuntimeException re) {
                // Best-effort: a failure on one link must not stop the checking of the rest, but it should leave a trace.
                logger.debug("Skipping the remaining internalLink \"" + currentLink + "\" of pageUrl \"" + pageUrl + "\" after a RuntimeException.", re);
            }
        }

        if (!atLeastOneDocOrDatasetLinkFound) {
            PageCrawler.handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
        }
        return false;
    }

    /**
     * Writes every link of the given set to the log, at debug level, one link per log-entry.
     *
     * @param currentPageLinks the links to print; must not be {@code null}
     */
    public static void printInternalLinksForDebugging(HashSet<String> currentPageLinks) {
        currentPageLinks.forEach(link -> logger.debug(link));
    }
}

