package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import com.google.common.collect.Maps;
import eu.dnetlib.data.graph.model.DNGFDecoder;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.util.dao.HBaseTableDAO;
import eu.dnetlib.data.proto.DNGFProtos.DNGFEntity;
import eu.dnetlib.data.proto.DliFieldTypeProtos;
import eu.dnetlib.data.proto.DliProtos;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

/**
 * Table mapper used by the dedup workflow.
 *
 * For every HBase row it reads the cell stored in the metadata column family under the qualifier named after the
 * configured entity type, decodes it, and (when the entity is not deleted by inference and its type matches the
 * configured one) emits one record per clustering key: the key is the clustering key, the value is the serialized
 * {@link MapDocument} built from the entity.
 */
public class DedupMapper extends TableMapper<Text, ImmutableBytesWritable> {

	private static final Log log = LogFactory.getLog(DedupMapper.class);

	/** Dedup configuration, loaded in {@link #setup} from the {@code JobParams.DEDUP_CONF} job property. */
	private DedupConfig dedupConf;

	/** Blacklists taken from the pace section of the configuration; passed to the clustering combiner. */
	private Map<String, List<String>> blackListMap = Maps.newHashMap();

	/** Entity type name from the workflow configuration, cached in {@link #setup}; also used as counter group. */
	private String entityType;

	/** Column qualifier of the cell holding the entity body: the UTF-8 bytes of {@link #entityType}. */
	private byte[] entityTypeQualifier;

	/** Reusable output key, refilled for every emitted clustering key. */
	private Text outKey;

	/** Reusable output value wrapping the serialized document. */
	private ImmutableBytesWritable ibw;

	/**
	 * Loads the dedup configuration from the job configuration, caches the values needed on every row and logs the
	 * effective settings.
	 *
	 * @param context
	 *            the mapper context giving access to the job configuration
	 * @throws IOException
	 *             declared by the overridden method
	 * @throws InterruptedException
	 *             declared by the overridden method
	 */
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		final String dedupConfJson = context.getConfiguration().get(JobParams.DEDUP_CONF);

		log.info("pace conf strings");
		log.info("pace conf: " + dedupConfJson);

		dedupConf = DedupConfig.load(dedupConfJson);

		blackListMap = dedupConf.getPace().getBlacklists();

		// Resolved once here instead of on every row. Bytes.toBytes encodes as UTF-8, whereas String.getBytes()
		// depends on the platform default charset of the node running the task.
		// NOTE(review): a null entity type now fails here rather than on the first map() call.
		entityType = dedupConf.getWf().getEntityType();
		entityTypeQualifier = Bytes.toBytes(entityType);

		outKey = new Text();
		ibw = new ImmutableBytesWritable();

		log.info("pace conf");
		log.info("entity type: " + entityType);
		log.info("clustering: " + dedupConf.getPace().getClustering());
		log.info("conditions: " + dedupConf.getPace().getConditions());
		log.info("fields: " + dedupConf.getPace().getModel());
		log.info("blacklists: " + blackListMap);
		log.info("wf conf: " + dedupConf.toString());
	}

	/**
	 * Processes one row: decodes the entity body and emits the document once per clustering key.
	 *
	 * Counters incremented: {@code <entityType>/"missing body"} when the row has no cell for the configured entity
	 * type, {@code <entityType>/"deleted by inference"} when the decoded object is flagged as such, and
	 * {@code <decoded type>/"decoded"} for every entity that gets past those two checks.
	 *
	 * @param keyIn
	 *            the row key; its string form is used as the document identifier
	 * @param result
	 *            the HBase row
	 * @param context
	 *            the mapper context used for counters and output
	 * @throws IOException
	 *             if writing to the context fails
	 * @throws InterruptedException
	 *             if writing to the context is interrupted
	 */
	@Override
	protected void map(final ImmutableBytesWritable keyIn, final Result result, final Context context) throws IOException, InterruptedException {

		final byte[] body = result.getValue(HBaseTableDAO.cfMetadataByte(), entityTypeQualifier);

		if (body == null) {
			context.getCounter(entityType, "missing body").increment(1);
			return;
		}

		final DNGFDecoder decoder = DNGFDecoder.decode(body, DliFieldTypeProtos.completionStatus, DliProtos.completionStatus, DliProtos.resolvedfrom, DliProtos.typedIdentifier);
		if (decoder.getDNGF().getDataInfo().getDeletedbyinference()) {
			context.getCounter(entityType, "deleted by inference").increment(1);
			return;
		}

		final DNGFEntity entity = decoder.getEntity();

		context.getCounter(entity.getType().toString(), "decoded").increment(1);

		// Only entities of the configured type take part in this dedup run.
		if (!entity.getType().equals(Type.valueOf(entityType))) {
			return;
		}

		final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
		emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
	}

	/**
	 * Writes the serialized document to the context once for each of the given keys.
	 *
	 * @param context
	 *            the mapper context receiving the output
	 * @param doc
	 *            the document to emit as value
	 * @param ngrams
	 *            the keys under which the document is emitted
	 * @throws IOException
	 *             if writing to the context fails
	 * @throws InterruptedException
	 *             if writing to the context is interrupted
	 */
	private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> ngrams) throws IOException, InterruptedException {
		if (ngrams.isEmpty()) {
			// Nothing to emit: skip serializing the document altogether.
			return;
		}

		// The value is identical for every key, so serialize the document once rather than once per key.
		ibw.set(doc.toByteArray());

		for (final String ngram : ngrams) {
			outKey.set(ngram);
			context.write(outKey, ibw);
		}
	}

}
