package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

import com.google.common.collect.Maps;
import com.google.protobuf.GeneratedMessage;

import eu.dnetlib.data.mapreduce.util.OafDecoder;
import eu.dnetlib.data.mapreduce.util.OafEntityDecoder;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DynConf;
import eu.dnetlib.pace.model.DocumentBuilder;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.DedupConfig;
import eu.dnetlib.pace.util.DedupConfigLoader;

/**
 * Map phase of the dedup job: reads one HBase row per call, decodes the entity stored in the
 * {@code body} column of the configured entity column family, and emits one
 * {@code (clustering key, serialized document)} pair for every key produced by
 * {@link BlacklistAwareClusteringCombiner#filterAndCombine}.
 *
 * <p>Rows without a body, rows whose entity type differs from the configured one, and results
 * whose result type class id is {@code "dataset"} produce no output.</p>
 */
public class DedupMapper extends TableMapper<Text, ImmutableBytesWritable> {

	/** Column qualifier holding the serialized entity. Encoded once, as UTF-8, via HBase {@link Bytes}. */
	private static final byte[] BODY_QUALIFIER = Bytes.toBytes("body");

	private Config paceConf;

	private DedupConfig dedupConf;

	private Map<String, Set<String>> blackListMap = Maps.newHashMap();

	/** Column family to read, derived from {@code dedupConf.getEntityName()} in {@link #setup}. */
	private byte[] entityFamily;

	/**
	 * Loads the pace configuration ({@code dedup.pace.conf}) and the workflow configuration
	 * ({@code dedup.wf.conf}) from the job configuration and caches the values used for every row.
	 *
	 * @param context the task context providing the job configuration
	 * @throws IOException declared by the overridden method
	 * @throws InterruptedException declared by the overridden method
	 */
	@Override
	protected void setup(Context context) throws IOException, InterruptedException {

		paceConf = DynConf.load(context.getConfiguration().get("dedup.pace.conf"));
		dedupConf = DedupConfigLoader.load(context.getConfiguration().get("dedup.wf.conf"));
		blackListMap = paceConf.blacklists();

		// Bytes.toBytes always encodes as UTF-8; String.getBytes() would depend on the platform default charset.
		entityFamily = Bytes.toBytes(dedupConf.getEntityName());

		System.out.println("dedup map phase \npace conf: " + paceConf.fields() + "\nwf conf: " + dedupConf.toString() + "\nblacklists: " + blackListMap);
	}

	/**
	 * Processes one row: builds the comparison document for the entity and emits it under each of
	 * its clustering keys.
	 *
	 * <p>Failures while decoding the entity or computing its clustering keys are treated as
	 * per-record problems: they are logged, counted under the {@code "processing error"} counter of
	 * the entity-name group, and the row is skipped. Failures while writing the output are NOT
	 * swallowed; they propagate so that the framework can fail/retry the task instead of silently
	 * losing map output.</p>
	 *
	 * @param keyIn the row key, used as the document identifier
	 * @param result the HBase row
	 * @param context the task context used for counters and output
	 * @throws IOException if writing the map output fails
	 * @throws InterruptedException if the task is interrupted while writing the map output
	 */
	@Override
	protected void map(ImmutableBytesWritable keyIn, Result result, Context context) throws IOException, InterruptedException {

		final byte[] body = result.getValue(entityFamily, BODY_QUALIFIER);

		if (body == null) {
			context.getCounter(dedupConf.getEntityName(), "missing body").increment(1);
			return;
		}

		MapDocument doc;
		Collection<String> ngrams;

		try {
			final OafEntity entity = OafDecoder.decode(body).getEntity();
			if (!entity.getType().equals(dedupConf.getEntityType())) {
				return;
			}

			//TODO: remove this hack - here because we don't want to dedup datasets
			if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("dataset")) {
				System.out.println("avoid to dedup dataset!");
				return;
			}

			final GeneratedMessage metadata = OafEntityDecoder.decode(entity).getMetadata();
			doc = DocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), metadata, paceConf.fields());
			ngrams = BlacklistAwareClusteringCombiner.filterAndCombine(doc, paceConf, blackListMap);
		} catch (Exception e) {
			// Best effort: one broken record must not kill the whole task, but make it visible in the job counters.
			System.out.println("GOT EX " + e);
			e.printStackTrace(System.out);
			context.getCounter(dedupConf.getEntityName(), "processing error").increment(1);
			return;
		}

		// Deliberately outside the try block: output failures and interruption must reach the framework.
		emitNGrams(context, doc, ngrams);
	}

	/**
	 * Writes the serialized document once for each clustering key.
	 *
	 * @param context the task context receiving the output
	 * @param doc the document to serialize and emit
	 * @param collection the clustering keys under which the document is emitted
	 * @throws IOException if writing the map output fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	private void emitNGrams(Context context, MapDocument doc, Collection<String> collection) throws IOException, InterruptedException {
		if (collection.isEmpty()) {
			return;
		}
		// The serialized form does not depend on the key, so compute it once rather than per n-gram.
		final byte[] serializedDoc = doc.toByteArray();
		for (String ngram : collection) {
			context.write(new Text(ngram), new ImmutableBytesWritable(serializedDoc));
		}
	}

}
