package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.Queue;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;

import eu.dnetlib.data.mapreduce.hbase.dedup.config.DedupConfig;
import eu.dnetlib.data.mapreduce.hbase.dedup.config.DedupConfigLoader;
import eu.dnetlib.data.mapreduce.util.DedupRootUtils;
import eu.dnetlib.data.mapreduce.util.OafDecoder;
import eu.dnetlib.data.mapreduce.util.OafDecoderImporter;
import eu.dnetlib.data.mapreduce.util.OafEntityIdReader;
import eu.dnetlib.data.proto.DedupRelProtos.DedupRel;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
import eu.dnetlib.data.transform.xml.AbstractDNetOafXsltFunctions;

/**
 * Reducer that processes the group of serialized Oaf values collected under one key.
 * <p>
 * When the group holds more than one element, the smallest entity id of the group is used to derive a root id (via
 * {@link DedupRootUtils#newId}); every element of the group is then rewritten with its <code>deletedbyinference</code> flag set,
 * and a pair of dedup relations (root to entity, entity to root) is emitted for it. Groups with a single element are only
 * counted.
 */
public class DedupPersonReducer extends TableReducer<Text, ImmutableBytesWritable, ImmutableBytesWritable> {

	/** Value passed to {@link Put#setWriteToWAL(boolean)} for every emitted Put. */
	private static final boolean WRITE_TO_WAL = false;

	/** Upper bound on the number of values taken from a single group; the remaining values of the group are not read. */
	private static final int MAX_Q_SIZE = 3000;

	/** Dedup configuration, loaded in {@link #setup} from the "dedup.wf.conf" job property. */
	private DedupConfig dedupConf;

	/**
	 * Loads the dedup configuration from the job configuration property "dedup.wf.conf".
	 */
	@Override
	protected void setup(Context context) throws IOException, InterruptedException {
		dedupConf = DedupConfigLoader.load(context.getConfiguration().get("dedup.wf.conf"));
	}

	/**
	 * Handles one group of values. Failures other than interruption are reported (stdout/stderr and a counter named after the
	 * exception class) and do not fail the task.
	 *
	 * @param key
	 *            the grouping key
	 * @param values
	 *            the serialized values grouped under the key
	 * @param context
	 *            the reducer context used for counters and output
	 * @throws InterruptedException
	 *             if the processing of the group is interrupted
	 */
	@Override
	protected void reduce(Text key, Iterable<ImmutableBytesWritable> values, Context context) throws IOException, InterruptedException {
		try {
			Queue<OafDecoder> q = prepare(key, values, context);
			if (q.size() > 1) {
				context.getCounter(dedupConf.getEntityName() + " count", lpad(q.size())).increment(1);
				String min = findMin(Iterables.transform(q, new OafEntityIdReader()));
				if (min == null) {
					// none of the elements exposed a usable id: nothing to elect as root
					context.getCounter(dedupConf.getEntityName(), "unable to find min").increment(1);
					return;
				}
				String rootId = DedupRootUtils.newId(min, dedupConf.getDedupRun());

				while (!q.isEmpty()) {
					markDuplicate(context, rootId, q.remove());
				}
			} else {
				context.getCounter(dedupConf.getEntityName(), "single instance").increment(1);
			}
		} catch (InterruptedException e) {
			// interruption must reach the framework, it is not a per-group data problem
			throw e;
		} catch (Throwable e) {
			// best effort: a broken group is counted and skipped, the task goes on
			System.out.println("GOT EX " + e);
			e.printStackTrace(System.err);
			context.getCounter(dedupConf.getEntityName(), e.getClass().toString()).increment(1);
		}
	}

	/**
	 * Decodes the values of a group into a queue, stopping as soon as the queue holds {@link #MAX_Q_SIZE} elements. When the
	 * limit is hit a counter in the group "[key]" is incremented.
	 *
	 * @return the decoded elements, at most {@link #MAX_Q_SIZE}
	 */
	private Queue<OafDecoder> prepare(Text key, Iterable<ImmutableBytesWritable> values, Context context) {
		Queue<OafDecoder> q = Lists.newLinkedList();
		for (OafDecoder decoder : Iterables.transform(values, new OafDecoderImporter())) {
			q.add(decoder);
			if (q.size() >= MAX_Q_SIZE) {
				context.getCounter("[" + key.toString() + "]", "size > " + MAX_Q_SIZE).increment(1);
				break;
			}
		}
		return q;
	}

	/**
	 * Finds the smallest string according to {@link String#compareTo(String)}. Null elements are ignored.
	 *
	 * @param keys
	 *            the candidate strings
	 * @return the smallest non-null element, or null when <code>keys</code> is empty or contains only nulls
	 */
	public static String findMin(Iterable<String> keys) {
		String min = null;
		for (String candidate : keys) {
			if (candidate == null) {
				continue;
			}
			// strict comparison: on ties the element met first is kept
			if ((min == null) || (min.compareTo(candidate) > 0)) {
				min = candidate;
			}
		}
		return min;
	}

	/**
	 * Emits the given element with <code>deletedbyinference</code> set and inference provenance "dedup person", followed by the
	 * two dedup relations linking it with the root.
	 *
	 * @param context
	 *            the reducer context
	 * @param rootId
	 *            the id of the root the element is merged into
	 * @param decoder
	 *            the element to mark
	 */
	private void markDuplicate(Context context, String rootId, OafDecoder decoder) throws InvalidProtocolBufferException, IOException,
			InterruptedException {

		Oaf.Builder builder = Oaf.newBuilder(decoder.getOaf());
		builder.getDataInfoBuilder().setDeletedbyinference(true).setInferenceprovenance("dedup person");

		Oaf oaf = builder.build();
		byte[] oafId = Bytes.toBytes(oaf.getEntity().getId());
		// encoded once, used as row key and as qualifier below
		byte[] rootKey = Bytes.toBytes(rootId);

		// writes the body, marked as deleted
		emit(context, oafId, dedupConf.getEntityName(), Bytes.toBytes("body"), oaf.toByteArray());
		context.getCounter(dedupConf.getEntityName(), "marked as deleted").increment(1);

		// writes the dedupRels in both directions
		emit(context, rootKey, RelType.dedupRel.toString(), oafId, buildRel(rootKey, oafId));
		emit(context, oafId, RelType.dedupRel.toString(), rootKey, buildRel(oafId, rootKey));
		context.getCounter(dedupConf.getEntityName(), RelType.dedupRel.toString()).increment(2);
	}

	/**
	 * Writes a single-cell Put to the output.
	 *
	 * @param rowkey
	 *            the row key, also used as output key
	 * @param family
	 *            the column family name
	 * @param qualifier
	 *            the column qualifier
	 * @param value
	 *            the cell value
	 */
	private void emit(Context context, byte[] rowkey, String family, byte[] qualifier, byte[] value) throws IOException, InterruptedException {
		Put put = new Put(rowkey);
		put.setWriteToWAL(WRITE_TO_WAL);
		put.add(Bytes.toBytes(family), qualifier, value);

		context.write(new ImmutableBytesWritable(rowkey), put);
	}

	/**
	 * Builds the serialized dedup relation going from <code>from</code> to <code>to</code>.
	 *
	 * @param from
	 *            the source id, as produced by {@link Bytes#toBytes(String)}
	 * @param to
	 *            the target id, as produced by {@link Bytes#toBytes(String)}
	 * @return the serialized Oaf relation
	 */
	private byte[] buildRel(byte[] from, byte[] to) {
		// Bytes.toString mirrors Bytes.toBytes (UTF-8); new String(byte[]) would depend on the platform charset
		OafRel.Builder oafRel = OafRel.newBuilder().setRelType(RelType.dedupRel).setDedupRel(DedupRel.newBuilder()).setChild(false)
				.setSource(Bytes.toString(from)).setTarget(Bytes.toString(to));
		Oaf oaf = Oaf.newBuilder().setKind(Kind.relation).setTimestamp(System.currentTimeMillis())
				.setDataInfo(AbstractDNetOafXsltFunctions.getDataInfo(null, "0.8", false, true).setInferenceprovenance("dedup")).setRel(oafRel).build();
		return oaf.toByteArray();
	}

	/**
	 * Left-pads the number with spaces to the width of {@link #MAX_Q_SIZE}.
	 */
	private String lpad(int s) {
		return StringUtils.leftPad(String.valueOf(s), String.valueOf(MAX_Q_SIZE).length());
	}

}
