package crawlercommons.sitemaps;

import crawlercommons.filters.URLFilter;
import crawlercommons.mimetypes.MimeTypeDetector;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.sax.DelegatorHandler;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:crawlercommons/sitemaps/SiteMapParser.class */
public class SiteMapParser {
    /** Logger used by this parser for debug and warning messages. */
    public static final Logger LOG = LoggerFactory.getLogger((Class<?>) SiteMapParser.class);
    // NOTE(review): not referenced in this class as shown; processText compares its line
    // counter against the literal 50000 instead.
    private static final int MAX_URLS = 50000;
    // Equals 50 * 1024 * 1024. NOTE(review): not referenced in this class as shown -
    // presumably a size limit enforced by callers; confirm.
    public static final int MAX_BYTES_ALLOWED = 52428800;
    /**
     * Passed to the XML handler; in processText, when true, URLs that do not start with
     * the sitemap's base URL are left out.
     */
    protected boolean strict;
    /** When true, processXml returns whatever the handler collected before a SAXException. */
    private boolean allowPartial;
    /** Forwarded to the XML handler; when true the accepted namespaces are forwarded as well. */
    protected boolean strictNamespace;
    /** Namespace URIs handed to the XML handler when {@link #strictNamespace} is set. */
    protected Set<String> acceptedNamespaces;
    /** Maps an extension namespace URI to the extension it enables. */
    protected Map<String, Extension> extensionNamespaces;
    /** Detects and normalizes the media type of sitemap content. */
    private MimeTypeDetector mimeTypeDetector;
    /** When false, processXml enables the parser's "disallow-doctype-decl" feature. */
    private boolean allowDocTypeDefinitions;
    /** Applied to every URL before it is used; in processText a null result drops the URL. */
    private Function<String, String> urlFilter;

    /**
     * Creates a parser with strict mode enabled and partial results disabled.
     */
    public SiteMapParser() {
        this(true, false);
    }

    /**
     * Creates a parser with partial results disabled.
     *
     * @param z value for the strict flag
     */
    public SiteMapParser(boolean z) {
        this(z, false);
    }

    /**
     * Creates a parser and initializes every field to its starting value.
     *
     * @param z  value for the strict flag
     * @param z2 value for the allow-partial flag (see processXml's SAXException handling)
     */
    public SiteMapParser(boolean z, boolean z2) {
        strict = z;
        allowPartial = z2;
        strictNamespace = false;
        acceptedNamespaces = new HashSet<>();
        extensionNamespaces = new HashMap<>();
        allowDocTypeDefinitions = false;
        // Default filter passes every URL through unchanged.
        urlFilter = candidate -> candidate;
        mimeTypeDetector = new MimeTypeDetector();
    }

    /**
     * Controls whether XML sitemaps may contain a DOCTYPE declaration.
     *
     * @param z when false, processXml enables the "disallow-doctype-decl" parser feature
     */
    public void setAllowDocTypeDefinitions(boolean z) {
        allowDocTypeDefinitions = z;
    }

    /**
     * @return the current value of the strict flag
     */
    public boolean isStrict() {
        return strict;
    }

    /**
     * @return the current value of the strict-namespace flag
     */
    public boolean isStrictNamespace() {
        return strictNamespace;
    }

    /**
     * Sets the strict-namespace flag. Turning it on also registers
     * {@code Namespace.SITEMAP} as an accepted namespace.
     *
     * @param z new value of the flag
     */
    public void setStrictNamespace(boolean z) {
        strictNamespace = z;
        if (z) {
            addAcceptedNamespace(Namespace.SITEMAP);
        }
    }

    /**
     * Adds one namespace URI to the set of accepted namespaces.
     *
     * @param str namespace URI to accept
     */
    public void addAcceptedNamespace(String str) {
        acceptedNamespaces.add(str);
    }

    /**
     * Adds every namespace URI in the array to the set of accepted namespaces.
     *
     * @param strArr namespace URIs to accept; must not be null
     */
    public void addAcceptedNamespace(String[] strArr) {
        java.util.Collections.addAll(acceptedNamespaces, strArr);
    }

    /**
     * Registers every namespace URI listed for the given extension in
     * {@code Namespace.SITEMAP_EXTENSION_NAMESPACES}.
     *
     * @param extension extension to enable
     */
    public void enableExtension(Extension extension) {
        for (String namespaceUri : Namespace.SITEMAP_EXTENSION_NAMESPACES.get(extension)) {
            extensionNamespaces.put(namespaceUri, extension);
        }
    }

    /**
     * Registers the namespace URIs of every {@code Extension} value.
     */
    public void enableExtensions() {
        for (Extension extension : Extension.values()) {
            for (String namespaceUri : Namespace.SITEMAP_EXTENSION_NAMESPACES.get(extension)) {
                extensionNamespaces.put(namespaceUri, extension);
            }
        }
    }

    /**
     * Replaces the function applied to each URL found in a sitemap.
     *
     * @param function mapping from the raw URL string to the URL to use; in processText a
     *                 null result causes the URL to be skipped
     */
    public void setURLFilter(Function<String, String> function) {
        urlFilter = function;
    }

    /**
     * Uses the given filter's {@code filter} method as the URL filter function.
     *
     * @param uRLFilter filter to delegate to; must not be null
     * @throws NullPointerException if {@code uRLFilter} is null
     */
    public void setURLFilter(URLFilter uRLFilter) {
        urlFilter = Objects.requireNonNull(uRLFilter)::filter;
    }

    /**
     * Downloads the content behind {@code url} and parses it as a sitemap.
     *
     * @param url location of the sitemap
     * @return the parsed sitemap, or null when {@code url} is null
     * @throws UnknownFormatException if the content cannot be parsed as a sitemap
     * @throws IOException if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(URL url) throws UnknownFormatException, IOException {
        return url == null ? null : parseSiteMap(IOUtils.toByteArray(url), url);
    }

    /**
     * Parses content for an already known sitemap and carries over its last-modified value.
     *
     * @param str             media type reported for the content
     * @param bArr            raw sitemap content
     * @param abstractSiteMap sitemap supplying the URL and the last-modified value
     * @return the newly parsed sitemap
     * @throws UnknownFormatException if the content cannot be parsed as a sitemap
     * @throws IOException if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(String str, byte[] bArr, AbstractSiteMap abstractSiteMap) throws UnknownFormatException, IOException {
        URL sitemapUrl = abstractSiteMap.getUrl();
        AbstractSiteMap parsed = parseSiteMap(str, bArr, sitemapUrl);
        parsed.setLastModified(abstractSiteMap.getLastModified());
        return parsed;
    }

    /**
     * Detects the media type of the content and parses it as a sitemap.
     *
     * @param bArr raw sitemap content
     * @param url  location the content was fetched from
     * @return the parsed sitemap, or null when {@code url} is null
     * @throws UnknownFormatException if the media type cannot be detected or is unsupported
     * @throws IOException if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(byte[] bArr, URL url) throws UnknownFormatException, IOException {
        if (url == null) {
            return null;
        }
        String detectedType = mimeTypeDetector.detect(bArr);
        if (detectedType != null) {
            return parseSiteMap(detectedType, bArr, url);
        }
        throw new UnknownFormatException(String.format(Locale.ROOT, "Failed to detect MediaType of sitemap '%s'", url));
    }

    /**
     * Parses sitemap content according to its media type.
     *
     * <p>The media type is normalized first. XML and plain-text content are parsed
     * directly. Gzip content is decompressed, the embedded media type is detected, and the
     * embedded XML or text is parsed.
     *
     * <p>Any exception raised while handling gzip content, including a failure to parse the
     * embedded sitemap, is reported as an {@link UnknownFormatException} with the
     * "Failed to detect embedded MediaType" message and the original exception as cause.
     *
     * @param str  media type reported for the content
     * @param bArr raw sitemap content
     * @param url  location the content was fetched from
     * @return the parsed sitemap
     * @throws UnknownFormatException if the media type (or the media type embedded in gzip
     *                                content) is unsupported or the content cannot be parsed
     * @throws IOException if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(String str, byte[] bArr, URL url) throws UnknownFormatException, IOException {
        String mimeType = this.mimeTypeDetector.normalize(str, bArr);
        if (this.mimeTypeDetector.isXml(mimeType)) {
            return processXml(url, bArr);
        }
        if (this.mimeTypeDetector.isText(mimeType)) {
            return processText(url, bArr);
        }
        if (!this.mimeTypeDetector.isGzip(mimeType)) {
            throw new UnknownFormatException(String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", str, url));
        }

        String message;
        // try-with-resources closes the decompressing stream on every path, including when
        // detection or parsing throws, so the inflater is always released.
        try (BufferedInputStream decompressed = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(bArr)))) {
            String embeddedType = this.mimeTypeDetector.detect(decompressed);
            if (this.mimeTypeDetector.isXml(embeddedType)) {
                return processGzippedXML(url, bArr);
            }
            if (this.mimeTypeDetector.isText(embeddedType)) {
                return processText(url, decompressed);
            }
            if (embeddedType == null) {
                message = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
            } else {
                message = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' (embedded in %s) from '%s'", embeddedType, str, url);
            }
        } catch (Exception e) {
            throw new UnknownFormatException(String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url), e);
        }
        throw new UnknownFormatException(message);
    }

    /**
     * Fetches and parses the sitemap at {@code url}, then walks it with {@code consumer}.
     * Does nothing except log a debug message when either argument is null.
     *
     * @param url      location of the sitemap or sitemap index
     * @param consumer action invoked for every sitemap URL found
     * @throws UnknownFormatException if a sitemap cannot be parsed
     * @throws IOException if reading a sitemap fails
     */
    public void walkSiteMap(URL url, Consumer<SiteMapURL> consumer) throws UnknownFormatException, IOException {
        if (url == null || consumer == null) {
            LOG.debug("Got null sitemap URL and/or action, stopping traversal");
            return;
        }
        AbstractSiteMap parsed = parseSiteMap(url);
        walkSiteMap(parsed, consumer);
    }

    /**
     * Walks a parsed sitemap. For a sitemap index, every non-null child sitemap is fetched
     * by URL and walked in turn; otherwise every non-null URL entry is passed to
     * {@code consumer}. Does nothing except log a debug message when either argument is
     * null.
     *
     * @param abstractSiteMap sitemap or sitemap index to walk
     * @param consumer        action invoked for every sitemap URL found
     * @throws UnknownFormatException if a child sitemap cannot be parsed
     * @throws IOException if reading a child sitemap fails
     */
    public void walkSiteMap(AbstractSiteMap abstractSiteMap, Consumer<SiteMapURL> consumer) throws UnknownFormatException, IOException {
        if (abstractSiteMap == null || consumer == null) {
            LOG.debug("Got null sitemap and/or action, stopping traversal");
            return;
        }
        if (!abstractSiteMap.isIndex()) {
            for (SiteMapURL entry : ((SiteMap) abstractSiteMap).getSiteMapUrls()) {
                if (entry != null) {
                    consumer.accept(entry);
                }
            }
            return;
        }
        for (AbstractSiteMap child : ((SiteMapIndex) abstractSiteMap).getSitemaps()) {
            if (child != null) {
                walkSiteMap(child.getUrl(), consumer);
            }
        }
    }

    /**
     * Parses XML sitemap content held in a byte array. The bytes are read through a BOM
     * stream and a leading-whitespace-skipping stream and decoded as UTF-8.
     *
     * @param url  location the content was fetched from
     * @param bArr raw XML content
     * @return the parsed sitemap
     * @throws UnknownFormatException if the content cannot be parsed as a sitemap
     */
    protected AbstractSiteMap processXml(URL url, byte[] bArr) throws UnknownFormatException {
        BOMInputStream bomStream = new BOMInputStream(new ByteArrayInputStream(bArr));
        InputStreamReader utf8Reader = new InputStreamReader(new SkipLeadingWhiteSpaceInputStream(bomStream), StandardCharsets.UTF_8);
        InputSource source = new InputSource();
        source.setCharacterStream(new BufferedReader(utf8Reader));
        return processXml(url, source);
    }

    /**
     * Parses a plain-text sitemap held in a byte array.
     *
     * @param url  location the content was fetched from
     * @param bArr raw text content
     * @return the parsed sitemap
     * @throws IOException if reading the content fails
     */
    protected SiteMap processText(URL url, byte[] bArr) throws IOException {
        InputStream content = new ByteArrayInputStream(bArr);
        return processText(url, content);
    }

    /**
     * Parses a plain-text sitemap with one URL per line, decoded as UTF-8.
     *
     * <p>Reading stops after 50000 lines; blank lines count toward that limit. Each
     * non-blank line is trimmed and passed through the URL filter. A line is skipped when
     * the filter returns null or the result is not a well-formed URL. In strict mode a URL
     * that does not start with the sitemap's base URL is skipped as well; otherwise it is
     * added and flagged as invalid.
     *
     * @param url         location the content was fetched from
     * @param inputStream stream supplying the text; it is not closed by this method
     * @return the sitemap, marked as processed
     * @throws IOException if reading from the stream fails
     */
    protected SiteMap processText(URL url, InputStream inputStream) throws IOException {
        LOG.debug("Processing textual Sitemap");
        SiteMap textSiteMap = new SiteMap(url);
        textSiteMap.setType(AbstractSiteMap.SitemapType.TEXT);
        BufferedReader reader = new BufferedReader(new InputStreamReader(new BOMInputStream(inputStream), StandardCharsets.UTF_8));
        int lineNumber = 0;
        String line;
        // The counter is only advanced once a line has actually been read.
        while ((line = reader.readLine()) != null && ++lineNumber <= 50000) {
            String candidate = line.trim();
            if (candidate.isEmpty()) {
                continue;
            }
            String filtered = this.urlFilter.apply(candidate);
            if (filtered == null) {
                // Log at most the first 1024 characters of the offending line.
                LOG.debug("Filtered url: [{}]", candidate.substring(0, Math.min(1024, candidate.length())));
                continue;
            }
            try {
                URL parsedUrl = new URL(filtered);
                boolean underBaseUrl = urlIsValid(textSiteMap.getBaseUrl(), parsedUrl.toString());
                if (!underBaseUrl && this.strict) {
                    LOG.debug("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", parsedUrl.toExternalForm(), textSiteMap.getBaseUrl());
                    continue;
                }
                SiteMapURL entry = new SiteMapURL(parsedUrl, underBaseUrl);
                textSiteMap.addSiteMapUrl(entry);
                LOG.debug("  {}. {}", Integer.valueOf(lineNumber), entry);
            } catch (MalformedURLException e) {
                LOG.debug("Bad url: [{}]", candidate.substring(0, Math.min(1024, candidate.length())));
            }
        }
        textSiteMap.setProcessed(true);
        return textSiteMap;
    }

    /**
     * Decompresses gzipped XML sitemap content and parses it.
     *
     * <p>The system id given to the XML parser is the sitemap URL with a trailing
     * {@code .gz} removed.
     *
     * @param url  location the content was fetched from
     * @param bArr gzip-compressed XML content
     * @return the parsed sitemap
     * @throws IOException if the content is not valid gzip data
     * @throws UnknownFormatException if the decompressed content cannot be parsed as a sitemap
     */
    protected AbstractSiteMap processGzippedXML(URL url, byte[] bArr) throws IOException, UnknownFormatException {
        LOG.debug("Processing gzipped XML");
        String xmlUrl = url.toString().replaceFirst("\\.gz$", "");
        LOG.debug("XML url = {}", xmlUrl);
        // Close the gzip stream once parsing is done (or has failed) so that its inflater
        // is released right away instead of waiting for garbage collection.
        try (GZIPInputStream gzipStream = new GZIPInputStream(new ByteArrayInputStream(bArr))) {
            InputSource inputSource = new InputSource(new SkipLeadingWhiteSpaceInputStream(new BOMInputStream(gzipStream)));
            inputSource.setSystemId(xmlUrl);
            return processXml(url, inputSource);
        }
    }

    /**
     * Parses an XML sitemap or sitemap index from the given input source.
     *
     * <p>The SAX parser is namespace-aware and non-validating. External general entities,
     * external parameter entities and external DTD loading are switched off, DOCTYPE
     * declarations are rejected unless explicitly allowed, and every entity is resolved to
     * empty content.
     *
     * <p>If parsing fails with a {@link SAXException}, partial results are allowed and the
     * handler has already built a sitemap, that sitemap is marked as processed and
     * returned.
     *
     * @param url         location of the sitemap; used by the handler and in messages
     * @param inputSource source of the XML content
     * @return the parsed (possibly partial) sitemap
     * @throws UnknownFormatException if the content cannot be read or parsed, or is not a
     *                                recognized sitemap format
     * @throws RuntimeException if the security features cannot be set on the parser factory
     * @throws IllegalStateException if the SAX parser cannot be created
     */
    protected AbstractSiteMap processXml(URL url, InputSource inputSource) throws UnknownFormatException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(false);
        factory.setXIncludeAware(false);
        factory.setNamespaceAware(true);

        // Only the feature configuration is guarded here. Parse failures must reach the
        // caller as UnknownFormatException rather than being rewrapped as a configuration
        // error.
        try {
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            if (!this.allowDocTypeDefinitions) {
                factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            }
        } catch (ParserConfigurationException | SAXException e) {
            throw new RuntimeException("Failed to configure XML parser: " + e.toString(), e);
        }

        DelegatorHandler handler = new DelegatorHandler(url, this.strict);
        handler.setStrictNamespace(isStrictNamespace());
        if (isStrictNamespace()) {
            handler.setAcceptedNamespaces(this.acceptedNamespaces);
        }
        handler.setExtensionNamespaces(this.extensionNamespaces);
        handler.setURLFilter(this.urlFilter);

        try {
            SAXParser saxParser = factory.newSAXParser();
            // Resolve every entity to empty content so nothing external is ever fetched.
            saxParser.getXMLReader().setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
            saxParser.parse(inputSource, handler);

            AbstractSiteMap parsed = handler.getSiteMap();
            if (parsed != null) {
                return parsed;
            }
            UnknownFormatException handlerError = handler.getException();
            if (handlerError != null) {
                throw handlerError;
            }
            throw new UnknownFormatException("Unknown XML format for: " + url);
        } catch (IOException e) {
            LOG.warn("Error parsing sitemap {}: {}", url, e.getMessage());
            UnknownFormatException failure = new UnknownFormatException("Failed to parse " + url);
            failure.initCause(e);
            throw failure;
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        } catch (SAXException e) {
            LOG.warn("Error parsing sitemap {}: {}", url, e.getMessage());
            AbstractSiteMap partial = handler.getSiteMap();
            if (!this.allowPartial || partial == null) {
                UnknownFormatException failure = new UnknownFormatException("Failed to parse " + url);
                failure.initCause(e);
                throw failure;
            }
            LOG.warn("Processed broken/partial sitemap for '{}'", url);
            partial.setProcessed(true);
            return partial;
        }
    }

    /**
     * Checks whether a URL lies under a sitemap's base URL, i.e. whether it starts with it.
     *
     * @param str  base URL of the sitemap
     * @param str2 URL to test
     * @return true if {@code str2} starts with {@code str}; false otherwise, including when
     *         either argument is null
     */
    public static boolean urlIsValid(String str, String str2) {
        // A missing base URL or test URL can never match; report that instead of throwing.
        if (str == null || str2 == null) {
            return false;
        }
        return str2.startsWith(str);
    }
}
