package org.archive.modules.extractor;

import com.lowagie.text.html.HtmlTags;
import java.io.IOException;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.exception.NestableRuntimeException;
import org.archive.modules.CrawlURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/* loaded from: input_file:WEB-INF/lib/heritrix-modules-3.1.0.jar:org/archive/modules/extractor/ExtractorJS.class */
/**
 * Content extractor that scans JavaScript text for quoted strings that look like URIs and
 * records each one on the {@link CrawlURI} as a speculative ({@code Hop.SPECULATIVE}) link
 * with context {@code LinkContext.JS_MISC}.
 */
public class ExtractorJS extends ContentExtractor {
    private static final long serialVersionUID = 2;

    /**
     * Regex used to find quoted strings. Group 1 is the opening delimiter: up to eight
     * backslashes followed by a single or double quote. Group 2 is the string body: up to
     * 2083 non-whitespace characters, matched reluctantly. The match ends with a
     * back-reference to group 1, i.e. the same delimiter that opened the string.
     */
    static final String JAVASCRIPT_STRING_EXTRACTOR = "(\\\\{0,8}+(?:\"|'))(\\S{0,2083}?)(?:\\1)";

    /**
     * Number of URIs passed to {@link #innerExtract(CrawlURI)} on this instance.
     * NOTE(review): incremented without synchronization; confirm whether a single instance
     * is ever used from several threads.
     */
    protected long numberOfCURIsHandled = 0;

    private static final Logger LOGGER = Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");

    /**
     * Running total of the values returned by {@link #considerStrings} from
     * {@link #innerExtract(CrawlURI)}; shared by all instances because it is static.
     * NOTE(review): updated without synchronization.
     */
    protected static long numberOfLinksExtracted = 0;

    /** URIs that are never processed by this extractor, compared by exact string equality. */
    protected static final String[] EXTRACTOR_URI_EXCEPTIONS = {"http://www.google-analytics.com/urchin.js"};

    /**
     * Decides whether the given URI should be scanned for JavaScript strings.
     *
     * <p>Returns {@code false} for URIs listed in {@link #EXTRACTOR_URI_EXCEPTIONS} and for
     * URIs without a content type. Otherwise returns {@code true} when the content type
     * mentions {@code javascript}, {@code jscript} or {@code ecmascript}, when the URI ends
     * with {@code .js} (ignoring case), or when the via-context string starts with
     * {@code HtmlTags.SCRIPT} (ignoring case).
     *
     * @param crawlURI the URI under consideration
     * @return {@code true} if {@link #innerExtract(CrawlURI)} should run for this URI
     */
    @Override
    protected boolean shouldExtract(CrawlURI crawlURI) {
        String uri = crawlURI.toString();
        for (String excluded : EXTRACTOR_URI_EXCEPTIONS) {
            if (uri.equals(excluded)) {
                return false;
            }
        }

        String contentType = crawlURI.getContentType();
        if (contentType == null) {
            return false;
        }
        if (contentType.contains("javascript")
                || contentType.contains("jscript")
                || contentType.contains("ecmascript")) {
            return true;
        }

        // Locale.ROOT keeps the case folding independent of the JVM's default locale
        // (e.g. under a Turkish locale "SCRIPT" would otherwise lower-case to "scr\u0131pt").
        if (uri.toLowerCase(Locale.ROOT).endsWith(".js")) {
            return true;
        }

        LinkContext viaContext = crawlURI.getViaContext();
        if (viaContext == null) {
            return false;
        }
        return viaContext.toString().toLowerCase(Locale.ROOT).startsWith(HtmlTags.SCRIPT);
    }

    /**
     * Runs {@link #considerStrings} over the recorded content of the URI and adds the result
     * to {@link #numberOfLinksExtracted}.
     *
     * @param crawlURI the URI whose recorded content is scanned
     * @return {@code true} when the scan completed or was abandoned because of a
     *         {@link StackOverflowError} (which is reported through {@code DevUtils.warnHandle});
     *         {@code false} when an {@link IOException} occurred, in which case the exception
     *         is added to the URI's non-fatal failures
     */
    @Override
    protected boolean innerExtract(CrawlURI crawlURI) {
        this.numberOfCURIsHandled++;
        try {
            try {
                numberOfLinksExtracted += considerStrings(
                        this, crawlURI, crawlURI.getRecorder().getContentReplayCharSequence(), true);
                return true;
            } catch (StackOverflowError e) {
                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
                return true;
            }
        } catch (IOException e2) {
            crawlURI.getNonFatalFailures().add(e2);
            return false;
        }
    }

    /**
     * Finds quoted strings in {@code charSequence} and records those that look like URIs as
     * speculative links on {@code crawlURI}. A quoted string that does not look like a URI is
     * itself searched recursively for nested quoted strings.
     *
     * @param extractor the extractor supplying the outlink limit and URI-error logging
     * @param crawlURI the URI the discovered links are attached to
     * @param charSequence the text to scan
     * @param z when {@code true} links are added with {@code Link.addRelativeToVia},
     *          otherwise with {@code Link.addRelativeToBase}
     * @return the number of URI-like strings found, including those found by recursive calls
     */
    public static long considerStrings(Extractor extractor, CrawlURI crawlURI, CharSequence charSequence, boolean z) {
        long found = 0;
        Matcher matcher = TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, charSequence);
        try {
            int searchFrom = 0;
            while (matcher.find(searchFrom)) {
                CharSequence candidate = charSequence.subSequence(matcher.start(2), matcher.end(2));
                if (UriUtils.isLikelyUri(candidate)) {
                    found++;
                    addSpeculativeLink(extractor, crawlURI, candidate.toString(), z);
                } else {
                    found += considerStrings(extractor, crawlURI, candidate, z);
                }
                // Resume right after the string body, so the closing delimiter of this
                // match may serve as the opening delimiter of the next one.
                searchFrom = matcher.end(2);
            }
        } finally {
            // Hand the matcher back even when the scan is aborted by an exception or by the
            // StackOverflowError that innerExtract anticipates.
            TextUtils.recycleMatcher(matcher);
        }
        return found;
    }

    /**
     * Unescapes one URI-like JavaScript string, applies {@code UriUtils.speculativeFixup}
     * and adds the result as a speculative link. URI errors are reported through
     * {@code extractor.logUriError} rather than propagated.
     */
    private static void addSpeculativeLink(
            Extractor extractor, CrawlURI crawlURI, String rawString, boolean relativeToVia) {
        String unescaped = rawString;
        try {
            unescaped = StringEscapeUtils.unescapeJavaScript(rawString);
        } catch (NestableRuntimeException e) {
            // Best effort: fall back to the raw string when unescaping fails.
            LOGGER.log(Level.WARNING, "problem unescaping some javascript", e);
        }

        String fixedUp = UriUtils.speculativeFixup(unescaped, crawlURI.getUURI());
        try {
            int maxOutlinks = extractor.getExtractorParameters().getMaxOutlinks();
            if (relativeToVia) {
                Link.addRelativeToVia(crawlURI, maxOutlinks, fixedUp, LinkContext.JS_MISC, Hop.SPECULATIVE);
            } else {
                Link.addRelativeToBase(crawlURI, maxOutlinks, fixedUp, LinkContext.JS_MISC, Hop.SPECULATIVE);
            }
        } catch (URIException e) {
            extractor.logUriError(e, crawlURI.getUURI(), fixedUp);
        }
    }
}
