package eu.dnetlib.iis.ingest.pmc.citations;

import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.pipe.joiner.LeftJoin;
import cascading.tuple.Fields;
import static eu.dnetlib.iis.ingest.pmc.citations.Constants.*;

/**
 * Cascading sub-assembly producing resolved citations out of ingested documents.
 *
 * <p>The assembly is wired from four incoming pipes and performs, in order:
 * <ol>
 * <li>citation extraction from the documents ({@link CitationExtractorFunction}),</li>
 * <li>translation of the destination identifier using the PMID mapping,</li>
 * <li>translation of the destination identifier using the DOI mapping,</li>
 * <li>translation of the source and then the destination identifier using
 * the deduplication mapping,</li>
 * <li>reshaping of the records into the output form ({@link CitationReshaperFuncion}).</li>
 * </ol>
 *
 * <p>Each mapping pipe is attached with a left join keyed on its
 * {@code ORIGINAL_ID_FIELD}; the replacement value is read from
 * {@code NEW_ID_FIELD}. How a missing mapping entry is treated is decided by
 * {@link ReplacerFunction}, not by this class.
 *
 * @author Mateusz Fedoryszak (m.fedoryszak@icm.edu.pl)
 * @author mhorst
 *
 */
public class ResolvedCitationsSubAssembly extends SubAssembly {

    /**
     * Builds the whole assembly and registers its single tail.
     *
     * @param docPipe documents the citations are extracted from
     * @param dedupMapPipe mapping applied to both source and destination identifiers
     * @param pmidToOaidPipe mapping joined on the citation's {@code DST_PMID_FIELD}
     * @param doiToOaidPipe mapping joined on the citation's {@code DST_DOI_FIELD}
     */
    public ResolvedCitationsSubAssembly(Pipe docPipe, Pipe dedupMapPipe,
            Pipe pmidToOaidPipe, Pipe doiToOaidPipe) {
        setPrevious(docPipe, dedupMapPipe, pmidToOaidPipe, doiToOaidPipe);

        // extract resolved citations from NLMs
        Pipe extracted = new Each(docPipe, new CitationExtractorFunction(), Fields.RESULTS);

        // translate PMID target identifiers to openaire identifier in DST_FIELD;
        // DST_DOI_FIELD is carried along because the next join is keyed on it
        Pipe pmidJoined = joinWithMapping(extracted, new Fields(DST_PMID_FIELD), pmidToOaidPipe);
        Pipe pmidResolved = replaceWithNewId(
                pmidJoined,
                new Fields(DST_FIELD),
                new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, DST_IDS_FIELD, DST_DOI_FIELD));

        // translate DOI target identifiers to openaire identifier in DST_FIELD
        Pipe doiJoined = joinWithMapping(pmidResolved, new Fields(DST_DOI_FIELD), doiToOaidPipe);
        Pipe doiResolved = replaceWithNewId(
                doiJoined,
                new Fields(DST_FIELD),
                new Fields(SRC_FIELD, POSITION_FIELD, RAW_TEXT_FIELD, DST_IDS_FIELD));

        // replace the source identifier with the id of the deduplicated document
        Pipe srcJoined = joinWithMapping(doiResolved, new Fields(SRC_FIELD), dedupMapPipe);
        Pipe srcDeduplicated = replaceWithNewId(
                srcJoined,
                new Fields(SRC_FIELD),
                new Fields(POSITION_FIELD, RAW_TEXT_FIELD, DST_FIELD, DST_IDS_FIELD));

        // replace the destination identifier with the id of the deduplicated document
        Pipe dstJoined = joinWithMapping(srcDeduplicated, new Fields(DST_FIELD), dedupMapPipe);
        Pipe dstDeduplicated = replaceWithNewId(
                dstJoined,
                new Fields(DST_FIELD),
                new Fields(POSITION_FIELD, RAW_TEXT_FIELD, SRC_FIELD, DST_IDS_FIELD));

        // mark citations to existent documents
        // FIXME I guess we don't need that step anymore, since identifiers are not generated
        // Pipe existenceMarkedPipe = new CoGroup(dedupPipe, new Fields(DST_FIELD), existentDocsIdsPipe,
        //         new Fields(ID_FIELD), new LeftJoin());

        Pipe reshaped = new Each(dstDeduplicated, new CitationReshaperFuncion(), Fields.RESULTS);

        setTails(reshaped);
    }

    /**
     * Left-joins the citations with a mapping pipe.
     *
     * @param citations left-hand side of the join
     * @param citationKey field of the citations used as the join key
     * @param mapping right-hand side of the join, keyed on {@code ORIGINAL_ID_FIELD}
     * @return the joining pipe
     */
    private static Pipe joinWithMapping(Pipe citations, Fields citationKey, Pipe mapping) {
        return new CoGroup(
                citations, citationKey,
                mapping, new Fields(ORIGINAL_ID_FIELD),
                new LeftJoin());
    }

    /**
     * Applies {@link ReplacerFunction} to a joined pipe, taking the new value
     * from {@code NEW_ID_FIELD}.
     *
     * @param joined pipe produced by {@link #joinWithMapping(Pipe, Fields, Pipe)}
     * @param replaced field whose value is to be replaced
     * @param retained remaining fields handed over to the function unchanged
     * @return pipe emitting the results of the function
     */
    private static Pipe replaceWithNewId(Pipe joined, Fields replaced, Fields retained) {
        return new Each(
                joined,
                new ReplacerFunction(replaced, new Fields(NEW_ID_FIELD), retained),
                Fields.RESULTS);
    }
}
