package eu.dnetlib.data.mapreduce.hbase.index;

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.hbase.AbstractHBaseMapReduceJob;

/**
 * Configures the MapReduce job that scans an HBase source table with {@link PrepareFeedMapper},
 * reduces with {@link PrepareFeedReducer} and writes {@code Text}/{@code Text} pairs to an
 * uncompressed sequence file.
 *
 * <p>Required job properties: {@link JobParams#HBASE_SOURCE_TABLE} (table to scan) and
 * {@link JobParams#INDEX_SEQ_FILE} (output path of the sequence file).
 */
public class PrepareFeedJob extends AbstractHBaseMapReduceJob {

	/** Single-byte {@code '|'} separator. Not used in this class; presumably consumed by the mapper/reducer. */
	public final static byte[] bSEPARATOR = { '|' };

	/** Fixed number of reduce tasks; the value is not read from the job properties. */
	private static final int NUM_REDUCE_TASKS = 500;

	/** Number of rows the scanner caches per fetch. */
	private static final int SCAN_CACHING = 100;

	/**
	 * Applies mapper, reducer, output and tuning settings to the given job.
	 *
	 * <p>Any existing content at the output path is removed (via {@code deleteHdfsFile}, which is
	 * not defined in this class) before the path is registered as the job output.
	 *
	 * @param job the job to configure
	 * @param p job properties; must contain {@link JobParams#HBASE_SOURCE_TABLE} and
	 *          {@link JobParams#INDEX_SEQ_FILE}
	 * @return the same {@code job} instance, configured
	 * @throws IllegalArgumentException if a required property is missing or empty
	 * @throws RuntimeException if the HBase table mapper cannot be initialised
	 */
	@Override
	public Job setJobDetails(Job job, Properties p) {

		// Validate up front so a misconfiguration fails with the property name,
		// before the job is touched or anything is deleted from HDFS.
		final String sourceTable = requiredProperty(p, JobParams.HBASE_SOURCE_TABLE);
		final Path path = new Path(requiredProperty(p, JobParams.INDEX_SEQ_FILE));

		initMapper(job, getScan(), sourceTable);

		job.setReducerClass(PrepareFeedReducer.class);
		job.setOutputFormatClass(SequenceFileOutputFormat.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		SequenceFileOutputFormat.setCompressOutput(job, false);

		// The output location must not exist when the job starts.
		deleteHdfsFile(job, path);

		SequenceFileOutputFormat.setOutputPath(job, path);

		job.setNumReduceTasks(NUM_REDUCE_TASKS);

		// Disable speculative execution under both the legacy (mapred.*) and the
		// current (mapreduce.*) property names.
		job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
		job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);

		job.getConfiguration().setBoolean("mapreduce.map.speculative", false);
		job.getConfiguration().setBoolean("mapreduce.reduce.speculative", false);

		// Compress intermediate map output; set under both the legacy and the current
		// property name, consistently with the speculative-execution flags above.
		job.getConfiguration().setBoolean("mapred.compress.map.output", true);
		job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);

		job.getConfiguration().set("dfs.blocksize", "16M");

		return job;
	}

	/**
	 * Returns the value of a mandatory job property.
	 *
	 * @param p job properties
	 * @param key property name
	 * @return the non-empty property value
	 * @throws IllegalArgumentException if the property is absent or empty
	 */
	private static String requiredProperty(final Properties p, final String key) {
		final String value = p.getProperty(key);
		if (value == null || value.isEmpty()) {
			throw new IllegalArgumentException("Missing required job property: " + key);
		}
		return value;
	}

	/**
	 * Builds the table scan used as mapper input.
	 *
	 * @return a scan with row caching set to {@link #SCAN_CACHING} and block caching disabled
	 */
	private Scan getScan() {
		final Scan scan = new Scan();
		scan.setCaching(SCAN_CACHING); // 1 is the default in Scan, which will be bad for MapReduce jobs
		scan.setCacheBlocks(false); // don't set to true for MR jobs

		return scan;
	}

	/**
	 * Registers {@link PrepareFeedMapper} as the table mapper of the job, emitting
	 * {@code Text} keys and {@code ImmutableBytesWritable} values.
	 *
	 * @param job the job to configure
	 * @param scan the scan selecting the rows fed to the mapper
	 * @param sourceTable name of the HBase table to read
	 * @throws RuntimeException wrapping the {@link IOException} raised by the HBase setup call
	 */
	private void initMapper(final Job job, final Scan scan, final String sourceTable) {
		try {
			TableMapReduceUtil.initTableMapperJob(sourceTable, scan, PrepareFeedMapper.class, Text.class, ImmutableBytesWritable.class, job);
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

}
