package eu.dnetlib.pace.common;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.collect.UnmodifiableIterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;

/* loaded from: input_file:BOOT-INF/lib/dnet-pace-core-2.5.1.jar:eu/dnetlib/pace/common/AbstractPaceFunctions.class */
public abstract class AbstractPaceFunctions {
    private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
    private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎";
    private static final String aliases_to = "0123456789+-=()n0123456789+-=()";
    protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
    protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
    protected static final FieldList EMPTY_FIELD = new FieldListImpl();

    /* JADX INFO: Access modifiers changed from: protected */
    public String concat(List<String> list) {
        return Joiner.on(" ").skipNulls().join(list);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public String cleanup(String str) {
        return fixAliases(nfd(str)).replaceAll("&ndash;", " ").replaceAll("&amp;", " ").replaceAll("&minus;", " ").replaceAll("([0-9]+)", " $1 ").replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ").replaceAll("\\n", " ").replaceAll("(?m)\\s+", " ").trim();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public String finalCleanup(String str) {
        return str.toLowerCase();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public boolean checkNumbers(String str, String str2) {
        return (getNumbers(str).equals(getNumbers(str2)) && getRomans(str).equals(getRomans(str2))) ? false : true;
    }

    protected String getRomans(String str) {
        StringBuilder sb = new StringBuilder();
        for (String str2 : str.split(" ")) {
            sb.append(isRoman(str2) ? str2 : "");
        }
        return sb.toString();
    }

    protected boolean isRoman(String str) {
        return str.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public String getNumbers(String str) {
        return str.replaceAll("\\D", "");
    }

    protected String fixAliases(String str) {
        StringBuilder sb = new StringBuilder();
        UnmodifiableIterator<Character> it = Lists.charactersOf(str).iterator();
        while (it.hasNext()) {
            char charValue = it.next().charValue();
            int indexOf = StringUtils.indexOf(aliases_from, charValue);
            sb.append(indexOf >= 0 ? aliases_to.charAt(indexOf) : charValue);
        }
        return sb.toString();
    }

    protected String removeSymbols(String str) {
        StringBuilder sb = new StringBuilder();
        UnmodifiableIterator<Character> it = Lists.charactersOf(str).iterator();
        while (it.hasNext()) {
            char charValue = it.next().charValue();
            sb.append(StringUtils.contains(alpha, charValue) ? Character.valueOf(charValue) : " ");
        }
        return sb.toString().replaceAll("\\s+", " ");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public String getFirstValue(Field field) {
        if (field == null || Iterables.isEmpty(field)) {
            return null;
        }
        return ((Field) Iterables.getFirst(field, EMPTY_FIELD)).stringValue();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public boolean notNull(String str) {
        return str != null;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public String normalize(String str) {
        return nfd(str).toLowerCase().replaceAll("(\\W|\\p{InCombiningDiacriticalMarks}|\\p{Punct}|\\d|\\n)+", " ").trim();
    }

    private String nfd(String str) {
        return Normalizer.normalize(str, Normalizer.Form.NFD);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public String filterStopWords(String str, Set<String> set) {
        StringTokenizer stringTokenizer = new StringTokenizer(str);
        StringBuilder sb = new StringBuilder();
        while (stringTokenizer.hasMoreTokens()) {
            String nextToken = stringTokenizer.nextToken();
            if (!set.contains(nextToken)) {
                sb.append(nextToken);
                sb.append(" ");
            }
        }
        return sb.toString().trim();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    public Collection<String> filterBlacklisted(Collection<String> collection, Set<String> set) {
        LinkedHashSet newLinkedHashSet = Sets.newLinkedHashSet();
        for (String str : collection) {
            if (!set.contains(str)) {
                newLinkedHashSet.add(str);
            }
        }
        return newLinkedHashSet;
    }

    public static Set<String> loadFromClasspath(String str) {
        HashSet newHashSet = Sets.newHashSet();
        try {
            Iterator<String> it = IOUtils.readLines(NGramUtils.class.getResourceAsStream(str)).iterator();
            while (it.hasNext()) {
                newHashSet.add(it.next());
            }
            return newHashSet;
        } catch (Throwable th) {
            return Sets.newHashSet();
        }
    }
}
