package eu.dnetlib.iis.ingest.pmc.citations;

import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_DOI_FIELD;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_FIELD;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_IDS_FIELD;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.DST_PMID_FIELD;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.POSITION_FIELD;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.RAW_TEXT_FIELD;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.SRC_FIELD;

import java.util.Map;

import org.apache.avro.util.Utf8;

import org.codehaus.jettison.json.JSONObject;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata;
import eu.dnetlib.iis.ingest.pmc.metadata.schemas.ReferenceMetadata;

/**
 * Emits one citation tuple for every reference listed in an
 * {@link ExtractedDocumentMetadata} record.
 * <p>
 * The record is read from the first position of the argument tuple. Each
 * emitted tuple has the fields, in order: SRC_FIELD (id of the citing
 * document), POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD (always {@code null}
 * here), DST_IDS_FIELD (external identifiers serialized as a JSON object),
 * DST_DOI_FIELD and DST_PMID_FIELD. The three identifier fields are
 * {@code null} when the reference carries no external identifiers.
 *
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
 * @author mhorst
 */
public class CitationExtractorFunction extends BaseOperation implements Function {
    private final static Fields fields =
            new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD,
                    DST_FIELD, DST_IDS_FIELD, DST_DOI_FIELD, DST_PMID_FIELD);

    private static final Utf8 PMID_KEY_UTF8 = new Utf8("pmid");
    private static final Utf8 DOI_KEY_UTF8 = new Utf8("doi");

    /**
     * Creates a function that takes a single argument and declares the seven
     * citation output fields.
     */
    public CitationExtractorFunction() {
        super(1, fields);
    }

    /**
     * Reads the document metadata from the first argument and adds one output
     * tuple per reference to the output collector. Nothing is emitted when the
     * document has no reference list.
     *
     * @param flowProcess  not used by this function
     * @param functionCall supplies the argument tuple and the output collector
     * @throws RuntimeException wrapping any exception raised while processing
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
        try {
            Tuple docTuple = functionCall.getArguments().getTuple();
            ExtractedDocumentMetadata meta = (ExtractedDocumentMetadata) docTuple.iterator().next();
            if (meta.getReferences() == null) {
                return;
            }
            for (ReferenceMetadata refMeta : meta.getReferences()) {
                functionCall.getOutputCollector().add(
                        buildCitationTuple(meta.getId().toString(), refMeta));
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds the output tuple describing a single reference.
     *
     * @param sourceId id of the citing document
     * @param refMeta  reference to describe
     * @return tuple whose values follow the declared output field order
     */
    private static Tuple buildCitationTuple(String sourceId, ReferenceMetadata refMeta) {
        Tuple result = new Tuple();
        result.addString(sourceId);
        result.addInteger(refMeta.getPosition());
        result.addString(refMeta.getText() != null ? refMeta.getText().toString() : null);
        // DST_FIELD: the destination document is not known at this stage.
        result.addString(null);

        // A missing identifier map is treated like an empty one instead of
        // being dereferenced, so such a reference no longer fails the flow.
        Map<? extends CharSequence, ? extends CharSequence> externalIds =
                refMeta.getBasicMetadata() != null
                        ? refMeta.getBasicMetadata().getExternalIds()
                        : null;
        if (externalIds != null && !externalIds.isEmpty()) {
            result.addString(new JSONObject(externalIds).toString());
            result.addString(lookupExternalId(externalIds, DOI_KEY_UTF8));
            result.addString(lookupExternalId(externalIds, PMID_KEY_UTF8));
        } else {
            result.addString(null);
            result.addString(null);
            result.addString(null);
        }
        return result;
    }

    /**
     * Looks up an external identifier by key.
     * <p>
     * The Utf8 form of the key is tried first; the String form is a fallback
     * because Utf8 never compares equal to java.lang.String, so a map keyed by
     * Strings would otherwise always miss.
     *
     * @param externalIds non-null identifier map
     * @param key         identifier type to look up
     * @return identifier value as a String, or {@code null} when absent
     */
    private static String lookupExternalId(
            Map<? extends CharSequence, ? extends CharSequence> externalIds, Utf8 key) {
        CharSequence value = externalIds.get(key);
        if (value == null) {
            value = externalIds.get(key.toString());
        }
        return value != null ? value.toString() : null;
    }
}
