package eu.dnetlib.data.collector.plugins.oai.engine;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:eu/dnetlib/data/collector/plugins/oai/engine/XmlCleaner.class */
/**
 * Repairs text so that it can be embedded in an XML document.
 *
 * <p>{@link #cleanAllEntities(String)} removes characters that XML 1.0 does not allow, rewrites
 * named entities that XML does not predefine into the character they stand for, drops named
 * entities it does not know, and escapes every ampersand that does not start an entity.
 *
 * <p>All state is built once in the static initializer and only read afterwards.
 */
public class XmlCleaner {
    /**
     * Numeric character reference that is passed through untouched: {@code &#}, an optional
     * {@code x}, two to four decimal digits and a closing {@code ;}. The leading {@code ^} anchors
     * the match at the ampersand being examined.
     */
    private static final Pattern VALID_CHARACTER_ENTITY_PATTERN = Pattern.compile("^&#x?\\d{2,4};");

    /** Reference to the vertical tab (decimal 11), which is not a legal XML character; it is deleted. */
    private static final Pattern INVALID_CONTROL_CHAR_PATTERN = Pattern.compile("&#11;");

    /**
     * Matches every character outside the XML 1.0 {@code Char} production, i.e. anything that is
     * not tab, CR, LF, U+0020-U+D7FF, U+E000-U+FFFD or U+10000-U+10FFFF. The supplementary range
     * is listed explicitly because the regex engine matches by code point; without it every
     * character beyond the Basic Multilingual Plane would be deleted.
     */
    private static final Pattern INVALID_CHARACTER_PATTERN =
            Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\\x{10000}-\\x{10FFFF}]");

    /** Entities predefined by XML; they are kept exactly as written. */
    private static final Set<String> GOOD_ENTITIES = new HashSet<>();

    /** Named entities XML does not define, mapped to the character each one stands for. */
    private static final Map<String, String> BAD_ENTITIES = new HashMap<>();

    /**
     * Names of the Latin-1 entities in code point order: index 0 is U+00A0 (nbsp) and the last
     * index is U+00FF (yuml). The order is significant because the replacement character is
     * derived from the position in this array.
     */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /** Code point of the first entry of {@link #LATIN1_ENTITY_NAMES}. */
    private static final int LATIN1_FIRST_CODE_POINT = 0xA0;

    static {
        GOOD_ENTITIES.add("&quot;");
        GOOD_ENTITIES.add("&amp;");
        GOOD_ENTITIES.add("&lt;");
        GOOD_ENTITIES.add("&gt;");
        // &apos; is the fifth entity predefined by XML 1.0; without it apostrophes written as
        // entities would be deleted as "unknown".
        GOOD_ENTITIES.add("&apos;");

        // Replacement characters are written as escapes so the class does not depend on the
        // encoding the source file is compiled with.
        BAD_ENTITIES.put("&euro;", "\u20AC");
        BAD_ENTITIES.put("&lsquo;", "\u2018");
        BAD_ENTITIES.put("&rsquo;", "\u2019");
        for (int offset = 0; offset < LATIN1_ENTITY_NAMES.length; offset++) {
            String entity = "&" + LATIN1_ENTITY_NAMES[offset] + ";";
            BAD_ENTITIES.put(entity, String.valueOf((char) (LATIN1_FIRST_CODE_POINT + offset)));
        }
    }

    /**
     * Cleans a string for inclusion in XML.
     *
     * <p>The steps, in order:
     * <ol>
     *   <li>every literal {@code &#11;} is deleted;</li>
     *   <li>every character that is not a legal XML 1.0 character is deleted;</li>
     *   <li>each {@code &} is examined: a numeric reference made of two to four decimal digits
     *       (optionally preceded by {@code x}) is kept; {@code &name;} is kept if XML predefines
     *       it, replaced by its character if it is a known Latin-1/euro/quote entity, and deleted
     *       otherwise; any other {@code &} is escaped as {@code &amp;}. Because {@code #} is not
     *       a letter or digit, numeric references of any other shape (for example hexadecimal
     *       ones containing letters) fall into the last case and have their ampersand
     *       escaped;</li>
     *   <li>each {@code <<} becomes {@code &lt;&lt;} and each {@code >>} becomes
     *       {@code &gt;&gt;}.</li>
     * </ol>
     *
     * @param str the text to clean; may be {@code null}
     * @return the cleaned text, or {@code null} if {@code str} is {@code null}
     */
    public static String cleanAllEntities(String str) {
        if (str == null) {
            return null;
        }
        String text = INVALID_CONTROL_CHAR_PATTERN.matcher(str).replaceAll("");
        text = INVALID_CHARACTER_PATTERN.matcher(text).replaceAll("");
        text = repairAmpersands(text);
        // Literal (non-regex) replacement; runs on every path, including input that ends with an
        // unterminated ampersand.
        return text.replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;");
    }

    /**
     * Walks the text once and resolves every ampersand: keeps accepted numeric references,
     * delegates {@code &name;} to {@link #handleEntity(String)} and escapes everything else.
     */
    private static String repairAmpersands(String text) {
        int amp = text.indexOf('&');
        if (amp == -1) {
            return text;
        }
        int length = text.length();
        StringBuilder cleaned = new StringBuilder(length + 16);
        int copiedUpTo = 0;
        while (amp != -1) {
            cleaned.append(text, copiedUpTo, amp);
            // Restricting the region to start at the ampersand lets the '^' anchor apply there
            // without copying the tail of the string.
            if (VALID_CHARACTER_ENTITY_PATTERN.matcher(text).region(amp, length).lookingAt()) {
                int referenceEnd = text.indexOf(';', amp) + 1;
                cleaned.append(text, amp, referenceEnd);
                copiedUpTo = referenceEnd;
            } else {
                int nameEnd = amp + 1;
                while (nameEnd < length && Character.isLetterOrDigit(text.charAt(nameEnd))) {
                    nameEnd++;
                }
                if (nameEnd < length && text.charAt(nameEnd) == ';') {
                    cleaned.append(handleEntity(text.substring(amp, nameEnd + 1)));
                    copiedUpTo = nameEnd + 1;
                } else {
                    // Not an entity: escape just the ampersand and resume right after it, so
                    // that the character which ended the scan (possibly another '&') is still
                    // examined.
                    cleaned.append("&amp;");
                    copiedUpTo = amp + 1;
                }
            }
            amp = text.indexOf('&', copiedUpTo);
        }
        cleaned.append(text, copiedUpTo, length);
        return cleaned.toString();
    }

    /**
     * Decides what a named entity becomes.
     *
     * @param str the complete entity, including the leading {@code &} and trailing {@code ;}
     * @return {@code str} itself if XML predefines it, the mapped character if it is a known
     *         non-XML entity, or the empty string if it is unknown
     */
    private static String handleEntity(String str) {
        if (GOOD_ENTITIES.contains(str)) {
            return str;
        }
        String replacement = BAD_ENTITIES.get(str);
        return replacement == null ? "" : replacement;
    }
}
