package eu.dnetlib.enabling.manager.msro.hadoop;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.BSONObject;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.google.common.collect.Lists;

/**
 * Mapper over BSON records that carry an {@code id} and a {@code body} field.
 * <p>
 * Two kinds of records are handled:
 * <ul>
 * <li>records whose {@code id} starts with {@code tags_}: the {@code body} is parsed as XML and, for every
 * {@code tags} element whose {@code kind} attribute is {@code theme} or {@code exporttag}, one pair is emitted
 * per non-blank child node. The key is the {@code id} without the {@code tags_} prefix, the value is
 * {@code theme::-::<text>} or {@code export::-::<text>};</li>
 * <li>any other record: a single pair ({@code id}, {@code body}) is emitted unchanged.</li>
 * </ul>
 */
public class HopeTagMapper extends Mapper<Object, BSONObject, Text, Text> {

	private static final Log log = LogFactory.getLog(HopeTagMapper.class); // NOPMD by marko on 11/24/08 5:02 PM

	/** Prefix of the {@code id} field that marks a record as a tag record. */
	private static final String TAG_RECORD_PREFIX = "tags_";
	/** Name of the XML elements that hold the tags inside a tag record body. */
	private static final String TAGS_ELEMENT = "tags";
	/** Name of the attribute that tells which kind of tags a {@code tags} element holds. */
	private static final String KIND_ATTRIBUTE = "kind";
	/** Prefix of the values emitted for {@code kind="theme"} tags. */
	private static final String THEME_PREFIX = "theme::-::";
	/** Prefix of the values emitted for {@code kind="exporttag"} tags. */
	private static final String EXPORT_PREFIX = "export::-::";

	// Reused across calls to avoid allocating a new Text per emitted pair.
	private final Text mapValue = new Text();
	private final Text mapKey = new Text();
	// NOTE(review): the factory uses default settings (DTDs / external entities are not disabled).
	// If record bodies can come from untrusted sources, consider hardening it against XXE.
	private final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();

	/**
	 * Emits either the record as is, or one pair per theme/export tag found in a tag record.
	 *
	 * @param key
	 *            input key, not used
	 * @param value
	 *            the record; must provide non-null {@code id} and {@code body} fields
	 * @param context
	 *            the context the output pairs are written to
	 * @throws IllegalArgumentException
	 *             if the {@code id} or {@code body} field is missing
	 * @throws RuntimeException
	 *             if the body of a tag record cannot be parsed as XML
	 */
	@Override
	public void map(Object key, BSONObject value, Context context) throws IOException, InterruptedException {
		final String identifier = requiredField(value, "id");
		final String bodyContent = requiredField(value, "body");

		if (!identifier.startsWith(TAG_RECORD_PREFIX)) {
			// a plain record: forwarded untouched
			mapKey.set(identifier);
			mapValue.set(bodyContent);
			context.write(mapKey, mapValue);
			return;
		}

		// a tag object record: the XML parser is only needed (and created) on this path
		final String mdIdentifier = identifier.substring(TAG_RECORD_PREFIX.length());
		mapKey.set(mdIdentifier);
		final Document doc = parseBody(identifier, bodyContent);
		for (String tag : findTags(doc.getElementsByTagName(TAGS_ELEMENT))) {
			mapValue.set(tag);
			context.write(mapKey, mapValue);
		}
	}

	/**
	 * Reads a mandatory field of the record as a string.
	 *
	 * @throws IllegalArgumentException
	 *             naming the field, instead of an anonymous NullPointerException, when it is absent
	 */
	private String requiredField(BSONObject record, String fieldName) {
		final Object fieldValue = record == null ? null : record.get(fieldName);
		if (fieldValue == null) {
			throw new IllegalArgumentException("Input record has no '" + fieldName + "' field: " + record);
		}
		return fieldValue.toString();
	}

	/**
	 * Parses the body of a tag record, reporting the record id when parsing fails.
	 */
	private Document parseBody(String identifier, String bodyContent) throws IOException {
		try {
			final DocumentBuilder builder = domFactory.newDocumentBuilder();
			return builder.parse(new InputSource(new StringReader(bodyContent)));
		} catch (ParserConfigurationException e) {
			throw new RuntimeException("Cannot create the XML parser for record " + identifier, e);
		} catch (SAXException e) {
			throw new RuntimeException("Cannot parse the body of record " + identifier, e);
		}
	}

	/**
	 * Collects the prefixed theme and export tags held by the given {@code tags} elements.
	 * Elements without a {@code kind} attribute, or with any other kind, contribute nothing.
	 */
	private List<String> findTags(NodeList tagNodes) {
		final List<String> res = Lists.newArrayList();
		for (int i = 0; i < tagNodes.getLength(); i++) {
			final Node tagsNode = tagNodes.item(i);
			final String tagType = kindOf(tagsNode);
			if ("theme".equals(tagType)) {
				addChildTexts(tagsNode, THEME_PREFIX, res);
			} else if ("exporttag".equals(tagType)) {
				addChildTexts(tagsNode, EXPORT_PREFIX, res);
			}
		}
		return res;
	}

	/**
	 * Returns the value of the {@code kind} attribute of the node, or null when the node has no such attribute.
	 */
	private String kindOf(Node node) {
		if (node.getAttributes() == null) {
			return null;
		}
		final Node kind = node.getAttributes().getNamedItem(KIND_ATTRIBUTE);
		return kind == null ? null : kind.getNodeValue();
	}

	/**
	 * Appends {@code prefix + text} for every child of the node whose text content is not blank.
	 * Blank children are typically the whitespace-only text nodes of indented XML.
	 */
	private void addChildTexts(Node parent, String prefix, List<String> target) {
		final NodeList children = parent.getChildNodes();
		for (int j = 0; j < children.getLength(); j++) {
			final String text = children.item(j).getTextContent();
			if (text != null && !text.trim().isEmpty()) {
				// the text is emitted as found: trimming is only used to detect blank nodes
				target.add(prefix + text);
			}
		}
	}

}
