package eu.dnetlib.data.mdstore.modular.plugin;

import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.base.Splitter;
import com.mongodb.DBObject;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import eu.dnetlib.data.mdstore.modular.action.DoneCallback;
import eu.dnetlib.data.mdstore.modular.action.MDStorePlugin;
import eu.dnetlib.data.mdstore.modular.connector.MDStoreDao;
import eu.dnetlib.data.mdstore.modular.mongodb.MDStoreDaoImpl;
import eu.dnetlib.data.mdstore.modular.mongodb.MongoMDStore;
import eu.dnetlib.rmi.data.MDStoreServiceException;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

/**
 * Created by claudio on 21/03/16.
 *
 * MDStore plugin that reads every record of an mdStore, extracts the text of its
 * <code>creator</code> and <code>subject</code> elements (matched by local name) and writes one
 * document per record into a MongoDB collection whose name is the part of the mdStore id that
 * precedes "::" followed by "person".
 */
public class CreatorExtractor implements MDStorePlugin {

	/**
	 * Shape of a classification subject URI. Capturing group 3 is the classification scheme,
	 * group 4 is the classification value.
	 */
	public static final String REGEX_SUBJECT = "^(info:eu-repo)\\/(classification)\\/([a-zA-Z]*)\\/(.*)$";
	private static final Log log = LogFactory.getLog(CreatorExtractor.class);

	/** Value used when the 'flush.threshold' parameter is not provided. */
	private static final int DEFAULT_FLUSH_THRESHOLD = 1000;

	/** Map key under which subjects that are not classification URIs are collected. */
	private static final String KEYWORD_TYPE = "keyword";

	/** Compiled once: the expression is evaluated for every subject of every record. */
	private static final Pattern SUBJECT_PATTERN = Pattern.compile(REGEX_SUBJECT);

	/** Splits a free-text subject on commas, trimming the tokens and dropping the empty ones. */
	private static final Splitter KEYWORD_SPLITTER = Splitter.on(",").trimResults().omitEmptyStrings();

	private static final String CREATOR_XPATH = "//*[local-name() = 'creator']";
	private static final String SUBJECT_XPATH = "//*[local-name() = 'subject']";

	/**
	 * Rebuilds the "person" collection for the given mdStore.
	 *
	 * The target collection is dropped first, then every record of the mdStore is parsed and the
	 * extracted data is inserted in batches. On success the callback is invoked with an empty map.
	 *
	 * @param dao
	 *            the MDStore dao; it is cast to {@link MDStoreDaoImpl} to obtain the database
	 * @param params
	 *            plugin parameters: 'mdId' (mandatory) and 'flush.threshold' (optional integer,
	 *            defaults to 1000)
	 * @param doneCallback
	 *            invoked once all the records have been processed and stored
	 * @throws MDStoreServiceException
	 *             when 'mdId' is blank, when 'flush.threshold' is not an integer, or when reading,
	 *             parsing or storing the records fails
	 */
	@Override
	public void run(final MDStoreDao dao, final Map<String, String> params, final DoneCallback doneCallback) throws MDStoreServiceException {

		final String mdId = params.get("mdId");
		if (StringUtils.isBlank(mdId)) {
			throw new MDStoreServiceException("missing param 'mdId'");
		}
		log.info("extract creators from mdStore: " + mdId);

		final int flushThreshold = readFlushThreshold(params);

		final MongoMDStore mdStore = (MongoMDStore) dao.getMDStore(mdId);

		final MongoDatabase db = ((MDStoreDaoImpl) dao).getDb();

		final String collectionName = StringUtils.substringBefore(mdId, "::") + "person";

		log.info("using collection: " + collectionName);

		final MongoCollection<org.bson.Document> collection = db.getCollection(collectionName);

		// Start from scratch: whatever a previous run stored is discarded.
		collection.drop();

		// NOTE(review): the reader runs with its default parser settings; if record bodies can
		// originate from untrusted sources, consider disabling DOCTYPE / external entities.
		final SAXReader reader = new SAXReader();

		final List<org.bson.Document> buffer = new ArrayList<>();

		try {
			for (final DBObject dbo : mdStore.getCollection().find()) {

				final String resultId = (String) dbo.get("id");
				final String prefix = StringUtils.substringBefore(resultId, "::");

				final Document doc = reader.read(new StringReader(dbo.get("body").toString()));

				final Map<String, Object> map = new HashMap<>();

				map.put("id", resultId);
				map.put("prefix", prefix);
				map.put("authors", parseCreators(resultId, doc));
				map.put("subjects", parseSubjects(doc));

				// NOTE(review): the payload is nested under a single top-level field named after
				// the record id (the id is also stored inside the payload) - confirm that the
				// readers of this collection expect this layout.
				buffer.add(new org.bson.Document(resultId, map));

				if (buffer.size() > flushThreshold) {
					flush(collection, buffer);
				}
			}
			// Store what is left; nothing is sent when the buffer is empty.
			flush(collection, buffer);

			doneCallback.call(new HashMap<String, String>());

		} catch (Exception e) {
			throw new MDStoreServiceException(e);
		}
	}

	/**
	 * Reads the optional 'flush.threshold' parameter.
	 *
	 * @param params
	 *            the plugin parameters
	 * @return the parsed threshold, or the default when the parameter is absent
	 * @throws MDStoreServiceException
	 *             when the parameter is present but is not a valid integer
	 */
	private int readFlushThreshold(final Map<String, String> params) throws MDStoreServiceException {
		final String value = params.get("flush.threshold");
		if (value == null) {
			return DEFAULT_FLUSH_THRESHOLD;
		}
		try {
			return Integer.parseInt(value);
		} catch (NumberFormatException e) {
			// Reported like the other parameter errors instead of escaping as an unchecked exception.
			throw new MDStoreServiceException("invalid param 'flush.threshold': " + value);
		}
	}

	/**
	 * Inserts the buffered documents and empties the buffer. Does nothing when the buffer is empty,
	 * because the driver refuses to insert an empty list.
	 *
	 * @param collection
	 *            the target collection
	 * @param buffer
	 *            the pending documents; cleared after a successful insert
	 */
	private void flush(final MongoCollection<org.bson.Document> collection, final List<org.bson.Document> buffer) {
		if (buffer.isEmpty()) {
			return;
		}
		collection.insertMany(buffer);
		buffer.clear();
	}

	/**
	 * Collects the creators of a record.
	 *
	 * @param resultId
	 *            the record identifier; the part preceding "::" prefixes every generated key
	 * @param doc
	 *            the parsed record
	 * @return a map from <code>prefix::md5(creator text)</code> to the creator text; creators with the
	 *         same text share one entry
	 * @throws UnsupportedEncodingException
	 *             propagated from {@link #md5(String)}
	 * @throws NoSuchAlgorithmException
	 *             propagated from {@link #md5(String)}
	 */
	private Map<String, String> parseCreators(final String resultId, final Document doc) throws UnsupportedEncodingException, NoSuchAlgorithmException {

		final Map<String, String> creatorMap = new HashMap<>();
		// The prefix depends on the record only, not on the single creator.
		final String prefix = StringUtils.substringBefore(resultId, "::");

		for (final Object node : doc.selectNodes(CREATOR_XPATH)) {
			final String creator = ((Element) node).getText();
			creatorMap.put(prefix + "::" + md5(creator), creator);
		}

		return creatorMap;
	}

	/**
	 * Collects the subjects of a record, grouped by type.
	 *
	 * A subject matching {@link #REGEX_SUBJECT} is stored under its classification scheme, with the
	 * classification value as the stored text; it is skipped when the scheme is blank and contributes
	 * no value when the value is blank. Every other subject is handled as a comma separated list of
	 * keywords stored under "keyword".
	 *
	 * @param doc
	 *            the parsed record
	 * @return a map from subject type to the values found for that type
	 */
	private Map<String, List<String>> parseSubjects(final Document doc) {

		final Map<String, List<String>> subjectMap = new HashMap<>();

		for (final Object node : doc.selectNodes(SUBJECT_XPATH)) {
			final String subject = ((Element) node).getText();
			final Matcher classification = SUBJECT_PATTERN.matcher(subject);

			if (!classification.find()) {
				// Not a classification URI (this includes info:eu-repo URIs of another shape):
				// the text itself must never end up being used as a map key.
				final List<String> keywords = valuesOfType(subjectMap, KEYWORD_TYPE);
				for (final String token : KEYWORD_SPLITTER.split(subject)) {
					keywords.add(token);
				}
				continue;
			}

			// Reading the groups (rather than replacing the match inside the text) keeps a trailing
			// line terminator of the element text out of both the type and the value.
			final String type = classification.group(3);
			if (StringUtils.isBlank(type)) {
				// No scheme: skip before any entry is created, so no blank key is produced.
				continue;
			}

			final List<String> values = valuesOfType(subjectMap, type);
			final String value = classification.group(4);
			if (StringUtils.isNotBlank(value)) {
				values.add(value);
			}
		}

		return subjectMap;
	}

	/**
	 * Returns the list associated to the given type, creating and registering it on first use.
	 *
	 * @param subjectMap
	 *            the map being populated
	 * @param type
	 *            the subject type
	 * @return the (possibly new) list of values for the type
	 */
	private List<String> valuesOfType(final Map<String, List<String>> subjectMap, final String type) {
		List<String> values = subjectMap.get(type);
		if (values == null) {
			values = new ArrayList<>();
			subjectMap.put(type, values);
		}
		return values;
	}

	/**
	 * Computes the MD5 digest of the UTF-8 bytes of the given string.
	 *
	 * @param s
	 *            the text to hash
	 * @return the digest as a lowercase hexadecimal string
	 * @throws NoSuchAlgorithmException
	 *             when the MD5 algorithm is not available
	 * @throws UnsupportedEncodingException
	 *             when the UTF-8 charset is not available
	 */
	public String md5(final String s) throws NoSuchAlgorithmException, UnsupportedEncodingException {
		final MessageDigest md = MessageDigest.getInstance("MD5");
		md.update(s.getBytes("UTF-8"));
		return new String(Hex.encodeHex(md.digest()));
	}
}
