package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.Queue;
import java.util.stream.Stream;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import eu.dnetlib.data.graph.model.DNGFDecoder;
import eu.dnetlib.data.graph.model.DNGFRowKeyDecoder;
import eu.dnetlib.data.mapreduce.JobParams;

import eu.dnetlib.data.mapreduce.util.dao.HBaseTableDAO;
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
import eu.dnetlib.data.proto.DNGFProtos.DNGFRel;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

public class SimpleDedupPersonReducer extends TableReducer<Text, ImmutableBytesWritable, ImmutableBytesWritable> {

	private static final int MAX_Q_SIZE = 3000;

	private DedupConfig dedupConf;

	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {
		dedupConf = DedupConfig.load(context.getConfiguration().get(JobParams.DEDUP_CONF));
	}

	@Override
	protected void reduce(final Text key, final Iterable<ImmutableBytesWritable> values, final Context context) throws IOException, InterruptedException {
		try {
			final Queue<DNGFDecoder> q = prepare(key, values, context);
			if (q.size() > 1) {

				if (q.size() < JobParams.MAX_COUNTERS) {
					context.getCounter(dedupConf.getWf().getEntityType() + " root group size", lpad(q.size())).increment(1);
				} else {
					context.getCounter(dedupConf.getWf().getEntityType() + " root group size", "> " + JobParams.MAX_COUNTERS).increment(1);
				}
				final String min = q.stream().map(HBaseTableDAO.idDecoder()).min(String::compareTo).get();
				if (min == null) {
					context.getCounter(dedupConf.getWf().getEntityType(), "unable to find min").increment(1);
					return;
				}
				final String rootId = HBaseTableDAO.newId(min, dedupConf.getWf().getDedupRun());

				while (!q.isEmpty()) {
					markDuplicate(context, rootId, q.remove());
				}
			} else {
				context.getCounter(dedupConf.getWf().getEntityType(), "1").increment(1);
			}
		} catch (final Throwable e) {
			System.out.println("GOT EX " + e);
			e.printStackTrace(System.err);
			context.getCounter(dedupConf.getWf().getEntityType(), e.getClass().toString()).increment(1);
		}
	}

	private Queue<DNGFDecoder> prepare(final Text key, final Iterable<ImmutableBytesWritable> values, final Context context) {
		final Queue<DNGFDecoder> q = Lists.newLinkedList();
		for (final DNGFDecoder decoder : Iterables.transform(values, new Function<ImmutableBytesWritable, DNGFDecoder>() {
			@Override
			public DNGFDecoder apply(final ImmutableBytesWritable ibw) {
				return DNGFDecoder.decode(ibw.copyBytes());
			}
		})) {
			q.add(decoder);
			if (q.size() >= MAX_Q_SIZE) {
				context.getCounter("[" + key.toString() + "]", "size > " + MAX_Q_SIZE).increment(1);
				break;
			}
		}
		return q;
	}

	private void markDuplicate(final Context context, final String rootId, final DNGFDecoder decoder) throws IOException,
			InterruptedException {

		final DNGF.Builder builder = DNGF.newBuilder(decoder.getDNGF());
		builder.getDataInfoBuilder().setDeletedbyinference(true).setInferenceprovenance(dedupConf.getWf().getConfigurationId());

		final DNGF oaf = builder.build();
		final String oafId = oaf.getEntity().getId();

		// writes the body, marked as deleted
		final String entityName = dedupConf.getWf().getEntityType();
		emit(context, Bytes.toBytes(oafId), HBaseTableDAO.cfMetadata(), Bytes.toBytes(entityName), oaf.toByteArray());
		context.getCounter(entityName, "marked as deleted").increment(1);

		// writes the dedupRels in both directions
		final Type entityType = Type.valueOf(entityName);
		final byte[] rowkey = Bytes.toBytes(rootId);

		final byte[] merges = HBaseTableDAO.getDedupQualifier_mergesBytes(entityType, oafId);
		emit(context, rowkey, HBaseTableDAO.cfRels(), merges, buildRel(rowkey, Bytes.toBytes(oafId), "merges"));

		final byte[] mergedIn = HBaseTableDAO.getDedupQualifier_mergedInBytes(entityType, new String(rowkey));
		emit(context, Bytes.toBytes(oafId), HBaseTableDAO.cfRels(), mergedIn, buildRel(Bytes.toBytes(oafId), rowkey, "isMergedIn"));

		context.getCounter(entityName, new String(merges)).increment(1);
		context.getCounter(entityName, new String(mergedIn)).increment(1);
	}

	private void emit(final Context context, final byte[] rowkey, final String family, final byte[] qualifier, final byte[] value) throws IOException,
			InterruptedException {

		final Put put = new Put(DNGFRowKeyDecoder.decode(rowkey).getKey().getBytes());
		put.setWriteToWAL(JobParams.WRITE_TO_WAL);
		put.add(Bytes.toBytes(family), qualifier, value);

		context.write(new ImmutableBytesWritable(rowkey), put);
	}

	private byte[] buildRel(final byte[] from, final byte[] to, final String relClass) {
		final DNGFRel.Builder oafRel = HBaseTableDAO.getDedup(new String(from), new String(to), relClass);
		final DNGF oaf =
				DNGF.newBuilder()
				.setKind(Kind.relation)
				.setLastupdatetimestamp(System.currentTimeMillis())
						.setDataInfo(
								AbstractDNetXsltFunctions.getDataInfo(null, "", "0.8", false, true).setInferenceprovenance(
								dedupConf.getWf().getConfigurationId())).setRel(oafRel)
								.build();
		return oaf.toByteArray();
	}

	private String lpad(final int s) {
		return StringUtils.leftPad(String.valueOf(s), String.valueOf(MAX_Q_SIZE).length());
	}

}
