package eu.dnetlib.pace.common;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.Normalizer;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.dnetlib.pace.clustering.NGramUtils;

/**
 * Set of common string functions: cleanup, normalization, number and roman-numeral extraction,
 * stopword / blacklist filtering and loading of word lists from the classpath.
 * 
 * All regular expressions are compiled once and shared; the helper methods keep no per-call state.
 * 
 * @author claudio
 *
 */
public abstract class AbstractPaceFunctions {
	
	/** Lines of {@code /eu/dnetlib/pace/config/stopwords_en.txt}; empty when the resource cannot be read. */
	protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");

	/** Lines of {@code /eu/dnetlib/pace/config/ngram_blacklist.txt}; empty when the resource cannot be read. */
	protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");	

	/** Characters kept by {@link #removeSymbols(String)}: ASCII letters, ASCII digits and the space. */
	private static final String ALPHA = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";

	/**
	 * Unicode superscript digits, signs, parentheses and n (U+2070, U+00B9, U+00B2, U+00B3, U+2074 to U+207F)
	 * followed by the subscript digits, signs and parentheses (U+2080 to U+208E). Written with escapes so the
	 * value does not depend on the encoding used to compile this file. Position i maps to ALIASES_TO.charAt(i).
	 */
	private static final String ALIASES_FROM = "\u2070\u00B9\u00B2\u00B3\u2074\u2075\u2076\u2077\u2078\u2079"
			+ "\u207A\u207B\u207C\u207D\u207E\u207F"
			+ "\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089"
			+ "\u208A\u208B\u208C\u208D\u208E";

	/** ASCII replacements, index-aligned with {@link #ALIASES_FROM}. */
	private static final String ALIASES_TO = "0123456789+-=()n0123456789+-=()";

	private static final Pattern DIGIT_RUN = Pattern.compile("([0-9]+)");
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");
	private static final Pattern NON_DIGIT = Pattern.compile("\\D");
	private static final Pattern ROMAN = Pattern.compile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
	private static final Pattern NORMALIZE_NOISE = Pattern
			.compile("(\\W|\\p{InCombiningDiacriticalMarks}|\\p{Punct}|\\d|\\n)+");
	
	/**
	 * Joins the non-null elements of the list with a single space.
	 * 
	 * @param l
	 *            the strings to join; null elements are skipped
	 * @return the joined string, empty when there is nothing to join
	 */
	protected String concat(List<String> l) {
		final StringBuilder joined = new StringBuilder();
		boolean first = true;
		for (String part : l) {
			if (part == null) {
				continue;
			}
			if (!first) {
				joined.append(' ');
			}
			joined.append(part);
			first = false;
		}
		return joined.toString();
	}

	/**
	 * Cleans a string for comparison: maps super/subscript aliases to ASCII, blanks the literal entities
	 * {@code &ndash;}, {@code &amp;} and {@code &minus;}, isolates every run of digits with spaces, collapses
	 * whitespace and finally replaces every character outside letters, digits and space with a space.
	 * 
	 * The symbol removal runs after the trim, so a symbol at either end of the input can leave a single
	 * leading or trailing space in the result.
	 * 
	 * @param s
	 *            the string to clean
	 * @return the cleaned string
	 */
	protected String cleanup(String s) {
		final String noEntities = fixAliases(s).replace("&ndash;", " ").replace("&amp;", " ").replace("&minus;", " ");
		final String spacedDigits = DIGIT_RUN.matcher(noEntities).replaceAll(" $1 ").trim();
		// after this step no line break is left, so no separate newline replacement is needed
		final String singleSpaced = WHITESPACE.matcher(spacedDigits).replaceAll(" ");
		return removeSymbols(singleSpaced);
	}
	
	/**
	 * Tells whether the two strings disagree on their numeric content.
	 * 
	 * @param a
	 *            first string
	 * @param b
	 *            second string
	 * @return true when the digits or the roman-numeral tokens of the two strings DIFFER, false when both agree
	 */
	protected boolean checkNumbers(String a, String b) {
		return !getNumbers(a).equals(getNumbers(b)) || !getRomans(a).equals(getRomans(b));
	}

	/**
	 * Concatenates, without any separator, the space-delimited tokens of the string that are roman numerals.
	 * 
	 * @param s
	 *            the string to scan
	 * @return the concatenated roman-numeral tokens, empty when there are none
	 */
	protected String getRomans(String s) {
		final StringBuilder sb = new StringBuilder();
		for (String t : s.split(" ")) {
			if (isRoman(t)) {
				sb.append(t);
			}
		}
		return sb.toString();
	}

	/**
	 * Checks whether the whole string is an upper-case roman numeral.
	 * 
	 * Every group of the pattern is optional, so the empty string is accepted as well.
	 * 
	 * @param s
	 *            the candidate token
	 * @return true when the entire string matches the roman-numeral pattern
	 */
	protected boolean isRoman(String s) {
		// a full match is used instead of a replace-with-sentinel comparison, which wrongly accepted
		// an input equal to the sentinel itself
		return ROMAN.matcher(s).matches();
	}

	/**
	 * Strips every non-digit character.
	 * 
	 * @param s
	 *            the string to scan
	 * @return the digits of the string, in order and without separators
	 */
	protected String getNumbers(String s) {
		return NON_DIGIT.matcher(s).replaceAll("");
	}

	/**
	 * Replaces superscript and subscript digits, signs, parentheses and superscript n with their ASCII form.
	 * 
	 * @param s
	 *            the string to fix
	 * @return a string of the same length with the aliases replaced
	 */
	protected String fixAliases(String s) {
		final int length = s.length();
		final StringBuilder sb = new StringBuilder(length);
		for (int i = 0; i < length; i++) {
			final char ch = s.charAt(i);
			final int alias = ALIASES_FROM.indexOf(ch);
			sb.append(alias >= 0 ? ALIASES_TO.charAt(alias) : ch);
		}
		return sb.toString();
	}

	/**
	 * Replaces every character that is not an ASCII letter, an ASCII digit or a space with a space and
	 * collapses the resulting runs of spaces into one.
	 * 
	 * @param s
	 *            the string to filter
	 * @return the filtered string
	 */
	protected String removeSymbols(String s) {
		final int length = s.length();
		final StringBuilder sb = new StringBuilder(length);
		for (int i = 0; i < length; i++) {
			final char ch = s.charAt(i);
			sb.append(ALPHA.indexOf(ch) >= 0 ? ch : ' ');
		}
		return WHITESPACE.matcher(sb).replaceAll(" ");
	}
	
	/////////////////////////
	
	/**
	 * Decomposes the string (NFD), lower-cases it and replaces every run of non-word characters, combining
	 * diacritical marks, punctuation, digits and line breaks with a single space, then trims the result.
	 * 
	 * Lower-casing uses {@link Locale#ROOT} so the result does not depend on the default locale of the JVM.
	 * 
	 * @param s
	 *            the string to normalize
	 * @return the normalized string
	 */
	protected String normalize(String s) {
		final String lowered = Normalizer.normalize(s, Normalizer.Form.NFD).toLowerCase(Locale.ROOT);
		return NORMALIZE_NOISE.matcher(lowered).replaceAll(" ").trim();
	}
	
	/**
	 * Removes the whitespace-delimited tokens contained in the given stopword set.
	 * 
	 * @param s
	 *            the string to filter
	 * @param stopwords
	 *            the tokens to drop (exact, case-sensitive match)
	 * @return the remaining tokens joined by single spaces
	 */
	protected String filterStopWords(String s, Set<String> stopwords) {
		final StringTokenizer st = new StringTokenizer(s);
		final StringBuilder sb = new StringBuilder();
		while (st.hasMoreTokens()) {
			final String token = st.nextToken();
			if (!stopwords.contains(token)) {
				sb.append(token);
				sb.append(" ");
			}
		}
		return sb.toString().trim();
	}
	
	/**
	 * Returns the elements of the collection that are not blacklisted.
	 * 
	 * @param set
	 *            the candidate strings
	 * @param ngramBlacklist
	 *            the strings to drop
	 * @return a new set, without duplicates, keeping the iteration order of the input
	 */
	protected Collection<String> filterBlacklisted(Collection<String> set, Set<String> ngramBlacklist) {
		final Set<String> newset = new LinkedHashSet<String>();
		for (String s : set) {
			if (!ngramBlacklist.contains(s)) {
				newset.add(s);
			}
		}
		return newset;
	}
	
	//////////////////////
	
	/**
	 * Loads the lines of a classpath resource into a set.
	 * 
	 * This is a best-effort loader: a null path, a missing resource or a read error all yield an empty set
	 * instead of an exception. The resource is resolved through this class, decoded as UTF-8 and the stream
	 * is always closed.
	 * 
	 * @param classpath
	 *            the resource path, as accepted by {@link Class#getResourceAsStream(String)}
	 * @return a new mutable set holding one entry per distinct line, or an empty set on any failure
	 */
	public static Set<String> loadFromClasspath(String classpath) {
		final Set<String> h = new HashSet<String>();
		if (classpath == null) {
			return h;
		}
		final InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath);
		if (in == null) {
			// resource not found
			return h;
		}
		try {
			final BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
			String line;
			while ((line = reader.readLine()) != null) {
				h.add(line);
			}
		} catch (IOException e) {
			// keep the historical contract: a partially read list is discarded
			return new HashSet<String>();
		} finally {
			try {
				in.close();
			} catch (IOException ignored) {
				// nothing useful can be done if closing fails
			}
		}
		return h;
	}	
	
}
