/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.pace.common;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.collect.UnmodifiableIterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Base class bundling the string clean-up, normalisation and filtering helpers
 * that concrete PACE functions inherit.
 *
 * <p>All text helpers expect non-null input and throw {@link NullPointerException}
 * otherwise, unless stated differently on the method.
 */
public abstract class AbstractPaceFunctions {
    /** Stop words read from {@code stopwords_en.txt}; empty when the resource cannot be loaded. */
    protected static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
    /** Blacklisted n-grams read from {@code ngram_blacklist.txt}; empty when the resource cannot be loaded. */
    protected static Set<String> ngramBlacklist = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");

    /** Characters that {@link #removeSymbols(String)} keeps; every other character becomes a blank. */
    private static final String ALPHA = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
    /** Superscript/subscript characters; position {@code i} is rewritten to {@code ALIASES_TO.charAt(i)}. */
    private static final String ALIASES_FROM = "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079\u207a\u207b\u207c\u207d\u207e\u207f\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089\u208a\u208b\u208c\u208d\u208e";
    /** Plain replacements, index-aligned with {@link #ALIASES_FROM}. */
    private static final String ALIASES_TO = "0123456789+-=()n0123456789+-=()";

    // Regular expressions are compiled once instead of on every call.
    private static final Pattern DIGIT_RUN = Pattern.compile("([0-9]+)");
    private static final Pattern NON_ASCII_OR_PUNCT = Pattern.compile("[^\\p{ASCII}]|\\p{Punct}");
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    private static final Pattern NON_DIGIT = Pattern.compile("\\D");
    private static final Pattern ROMAN_NUMERAL = Pattern.compile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
    private static final Pattern NORMALIZE_SEPARATORS = Pattern.compile("(\\W|\\p{InCombiningDiacriticalMarks}|\\p{Punct}|\\d|\\n)+");

    /** Shared empty field list, used as the fallback element in {@link #getFirstValue(Field)}. */
    protected static final FieldList EMPTY_FIELD = new FieldListImpl();

    /**
     * Joins the given strings with a single blank, skipping {@code null} elements.
     *
     * @param l the strings to join
     * @return the joined string
     */
    protected String concat(List<String> l) {
        return Joiner.on(" ").skipNulls().join(l);
    }

    /**
     * Reduces a string to blank-separated ASCII tokens.
     *
     * <p>Steps, in order: NFD decomposition, superscript/subscript aliasing, removal of the
     * {@code &ndash;}, {@code &amp;} and {@code &minus;} entities, isolation of digit runs,
     * blanking of every non-ASCII or punctuation character, whitespace collapsing and trimming.
     * Because decomposed combining marks are non-ASCII they are blanked too, so an accented
     * letter in the middle of a word splits that word.
     *
     * @param s the raw text
     * @return the cleaned text, without leading or trailing whitespace
     */
    protected String cleanup(String s) {
        String text = this.fixAliases(this.nfd(s));
        // Literal replacements: none of these entities contains a regex metacharacter.
        text = text.replace("&ndash;", " ").replace("&amp;", " ").replace("&minus;", " ");
        // Surround every digit run with blanks so numbers become separate tokens.
        text = DIGIT_RUN.matcher(text).replaceAll(" $1 ");
        text = NON_ASCII_OR_PUNCT.matcher(text).replaceAll(" ");
        // Collapsing whitespace also takes care of line breaks.
        return WHITESPACE_RUN.matcher(text).replaceAll(" ").trim();
    }

    /**
     * Lower-cases the string independently of the JVM default locale.
     *
     * @param s the text to finalise
     * @return the lower-cased text
     */
    protected String finalCleanup(String s) {
        // Locale.ROOT avoids locale-specific mappings such as the Turkish dotless i.
        return s.toLowerCase(Locale.ROOT);
    }

    /**
     * Tells whether the two strings disagree on their digits or on their Roman-numeral tokens.
     *
     * @param a first string
     * @param b second string
     * @return {@code true} when the digits or the Roman numerals differ, {@code false} when both agree
     */
    protected boolean checkNumbers(String a, String b) {
        String digitsA = this.getNumbers(a);
        String digitsB = this.getNumbers(b);
        String romansA = this.getRomans(a);
        String romansB = this.getRomans(b);
        boolean sameDigits = digitsA.equals(digitsB);
        boolean sameRomans = romansA.equals(romansB);
        return !(sameDigits && sameRomans);
    }

    /**
     * Concatenates, without separator, every blank-delimited token accepted by {@link #isRoman(String)}.
     *
     * @param s the text to scan
     * @return the concatenated Roman-numeral tokens, possibly empty
     */
    protected String getRomans(String s) {
        StringBuilder romans = new StringBuilder();
        for (String token : s.split(" ")) {
            if (this.isRoman(token)) {
                romans.append(token);
            }
        }
        return romans.toString();
    }

    /**
     * Checks whether the whole string is an upper-case Roman numeral.
     *
     * @param s the candidate token
     * @return {@code true} for a non-empty string that entirely matches the Roman-numeral pattern
     */
    protected boolean isRoman(String s) {
        // Every group of the pattern is optional, so the empty string must be excluded explicitly.
        return !s.isEmpty() && ROMAN_NUMERAL.matcher(s).matches();
    }

    /**
     * Strips every non-digit character.
     *
     * @param s the text to scan
     * @return the digits of {@code s}, concatenated in their original order
     */
    protected String getNumbers(String s) {
        return NON_DIGIT.matcher(s).replaceAll("");
    }

    /**
     * Replaces superscript and subscript characters with their plain counterparts.
     *
     * @param s the text to rewrite
     * @return a string of the same length with the aliased characters substituted
     */
    protected String fixAliases(String s) {
        StringBuilder out = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char ch = s.charAt(i);
            int aliasIndex = ALIASES_FROM.indexOf(ch);
            out.append(aliasIndex >= 0 ? ALIASES_TO.charAt(aliasIndex) : ch);
        }
        return out.toString();
    }

    /**
     * Blanks every character that is not an ASCII letter, an ASCII digit or a blank,
     * then collapses whitespace runs. The result is not trimmed.
     *
     * @param s the text to rewrite
     * @return the text with symbols replaced by single blanks
     */
    protected String removeSymbols(String s) {
        StringBuilder out = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char ch = s.charAt(i);
            out.append(ALPHA.indexOf(ch) >= 0 ? ch : ' ');
        }
        return WHITESPACE_RUN.matcher(out).replaceAll(" ");
    }

    /**
     * Returns the string value of the first element of the given field.
     *
     * @param values the field to read, may be {@code null}
     * @return the first element's string value, or {@code null} when {@code values} is
     *         {@code null} or has no elements
     */
    protected String getFirstValue(Field values) {
        if (values == null) {
            return null;
        }
        Iterable<?> items = (Iterable<?>) values;
        if (Iterables.isEmpty(items)) {
            return null;
        }
        Object first = Iterables.getFirst(items, (Object) EMPTY_FIELD);
        return ((Field) first).stringValue();
    }

    /**
     * Null check.
     *
     * @param s the value to test
     * @return {@code true} when {@code s} is not {@code null}
     */
    protected boolean notNull(String s) {
        return s != null;
    }

    /**
     * Lower-cases the NFD form of the string and replaces every run of non-word characters,
     * combining marks, punctuation, digits and line breaks with one blank.
     *
     * @param s the text to normalise
     * @return the normalised, trimmed text
     */
    protected String normalize(String s) {
        // Locale.ROOT keeps ASCII letters ASCII on every platform, so they are not blanked as non-word characters.
        String lowered = this.nfd(s).toLowerCase(Locale.ROOT);
        return NORMALIZE_SEPARATORS.matcher(lowered).replaceAll(" ").trim();
    }

    /** Returns the canonical decomposition (NFD) of the string. */
    private String nfd(String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD);
    }

    /**
     * Drops the whitespace-delimited tokens contained in the given stop-word set.
     *
     * @param s         the text to filter
     * @param stopwords the tokens to drop (compared exactly, case-sensitively)
     * @return the remaining tokens joined by single blanks
     */
    protected String filterStopWords(String s, Set<String> stopwords) {
        StringTokenizer tokens = new StringTokenizer(s);
        StringBuilder kept = new StringBuilder();
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            if (!stopwords.contains(token)) {
                kept.append(token).append(" ");
            }
        }
        return kept.toString().trim();
    }

    /**
     * Copies the given strings, leaving out the blacklisted ones.
     *
     * @param set            the candidate strings
     * @param ngramBlacklist the strings to leave out
     * @return a new insertion-ordered set holding the non-blacklisted strings
     */
    protected Collection<String> filterBlacklisted(Collection<String> set, Set<String> ngramBlacklist) {
        Set<String> kept = new LinkedHashSet<String>();
        for (String candidate : set) {
            if (!ngramBlacklist.contains(candidate)) {
                kept.add(candidate);
            }
        }
        return kept;
    }

    /**
     * Reads a classpath resource line by line into a set, decoding it as UTF-8.
     *
     * <p>Loading is best effort: a missing or unreadable resource yields an empty set
     * instead of an exception. The stream is always closed.
     *
     * @param classpath the resource path, resolved relative to {@code NGramUtils}
     * @return a new mutable set with the resource's lines, or an empty set on failure
     */
    public static Set<String> loadFromClasspath(String classpath) {
        InputStream in = null;
        try {
            in = NGramUtils.class.getResourceAsStream(classpath);
            if (in == null) {
                // Resource not found on the classpath.
                return new HashSet<String>();
            }
            return new HashSet<String>(IOUtils.readLines(in, "UTF-8"));
        } catch (IOException e) {
            // Best effort: an unreadable resource is treated like a missing one.
            return new HashSet<String>();
        } catch (RuntimeException e) {
            // Best effort: e.g. a null path; callers rely on never getting an exception here.
            return new HashSet<String>();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
}

