package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

import com.google.common.collect.Maps;
import com.google.protobuf.GeneratedMessage;

import eu.dnetlib.data.mapreduce.util.DedupUtils;
import eu.dnetlib.data.mapreduce.util.OafDecoder;
import eu.dnetlib.data.mapreduce.util.OafEntityDecoder;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DynConf;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import eu.dnetlib.pace.util.DedupConfig;
import eu.dnetlib.pace.util.DedupConfigLoader;

/**
 * Map phase of the dedup workflow.
 * <p>
 * For every HBase row that carries a body in the column family named after the configured entity type, the body is
 * decoded into an {@link OafEntity}, turned into a {@link MapDocument} and emitted once for each key produced by
 * {@link BlacklistAwareClusteringCombiner#filterAndCombine}. Rows without a body are only counted.
 */
public class DedupMapper extends TableMapper<Text, ImmutableBytesWritable> {

	private Config paceConf;

	private DedupConfig dedupConf;

	private Map<String, Set<String>> blackListMap = Maps.newHashMap();

	/** Entity type configured for this workflow; also used as counter group name. */
	private String entityType;

	/** Column family holding the entity body, derived once from {@link #entityType}. */
	private byte[] entityFamily;

	/** Reused output key, to avoid allocating a new Text per emitted record. */
	private Text outKey;

	/** Reused output value wrapper. */
	private ImmutableBytesWritable ibw;

	/**
	 * Loads the pace and workflow configurations from the job configuration (properties {@code dedup.pace.conf} and
	 * {@code dedup.wf.conf}) and prepares the reusable output objects.
	 *
	 * @param context
	 *            the mapper context giving access to the job configuration
	 */
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		paceConf = DynConf.load(context.getConfiguration().get("dedup.pace.conf"));
		dedupConf = DedupConfigLoader.load(context.getConfiguration().get("dedup.wf.conf"));
		blackListMap = paceConf.blacklists();

		entityType = dedupConf.getEntityType();
		// Bytes.toBytes encodes with a fixed charset, unlike String.getBytes() which depends on the platform default.
		entityFamily = Bytes.toBytes(entityType);

		outKey = new Text();
		ibw = new ImmutableBytesWritable();

		System.out.println("dedup map phase \npace conf: " + paceConf.fields() + "\nwf conf: " + dedupConf.toString() + "\nblacklists: " + blackListMap);
	}

	/**
	 * Processes one HBase row: decodes the entity body, skips entities of a different type and results whose result
	 * type class id is "dataset", and emits the resulting document under each of its clustering keys.
	 *
	 * @param keyIn
	 *            the row key, used as the document identifier
	 * @param result
	 *            the row content
	 * @param context
	 *            the mapper context used for output and counters
	 */
	@Override
	protected void map(final ImmutableBytesWritable keyIn, final Result result, final Context context) throws IOException, InterruptedException {

		final byte[] body = result.getValue(entityFamily, DedupUtils.BODY_B);

		if (body == null) {
			context.getCounter(entityType, "missing body").increment(1);
			return;
		}

		final OafEntity entity = OafDecoder.decode(body).getEntity();

		// The configured entity type is a String, so it must be compared with the enum constant's name:
		// calling Type.equals(String) can never be true and would silently drop every row.
		if (!entity.getType().name().equals(entityType)) {
			return;
		}

		// TODO: remove this hack - here because we don't want to dedup datasets
		if (Type.result == entity.getType() && entity.getResult().getMetadata().getResulttype().getClassid().equals("dataset")) {
			return;
		}

		final GeneratedMessage metadata = OafEntityDecoder.decode(entity).getMetadata();
		final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), metadata, paceConf.fields());
		emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, paceConf, blackListMap));
	}

	/**
	 * Writes the given document once for each key in the collection.
	 *
	 * @param context
	 *            the mapper context used for output
	 * @param doc
	 *            the document to emit as value
	 * @param collection
	 *            the keys under which the document is emitted
	 */
	private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> collection) throws IOException, InterruptedException {
		if (collection.isEmpty()) {
			return;
		}

		// The serialized document is the same for every key: build it once instead of once per iteration.
		ibw.set(doc.toByteArray());
		for (final String ngram : collection) {
			outKey.set(ngram);
			context.write(outKey, ibw);
		}
	}

}
