-- Avro reader for person records; the schema class name comes from the
-- $schema_input_person script parameter.
DEFINE avro_load_person org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_person');

-- Avro reader for document metadata records; the schema class name comes from
-- the $schema_input_metadata script parameter.
DEFINE avro_load_metadata org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_metadata');

-- Avro writer for the resulting citation metadata; configured with storage
-- index '0' and the schema class given by $schema_output_citation_metadata.
DEFINE avro_store_citation_metadata org.apache.pig.piggybank.storage.avro.AvroStorage(
    'index', '0',
    'output_schema_class', '$schema_output_citation_metadata');

-- Short alias for the project's NullToEmptyBag UDF.
-- NOTE(review): judging by the class name it presumably maps a null bag to an
-- empty one -- confirm against the UDF source.
DEFINE CREATE_ARRAY eu.dnetlib.iis.transformers.udfs.NullToEmptyBag;

-- Input relations; both paths are supplied as script parameters.
person = LOAD '$input_person'
    USING avro_load_person;
metadata = LOAD '$input_metadata'
    USING avro_load_metadata;

-- Resolve the author ids of every document to author full names and collect
-- them into one bag per document.

-- One row per (document id, author id) pair.
docWithFlatAuthors = FOREACH metadata GENERATE
    id,
    FLATTEN(authorIds) AS authorId;

-- Left outer join: author ids with no matching person record are kept and get
-- null person fields.
docWithAuthors = JOIN docWithFlatAuthors BY authorId LEFT, person BY id;

docWithFullnames = FOREACH docWithAuthors GENERATE
    docWithFlatAuthors::id AS id,
    person::fullname AS author;

docsGroupedById = GROUP docWithFullnames BY id;

-- Per document: discard the null names produced by unmatched author ids and
-- keep only the name column.
docWithFullnamesArray = FOREACH docsGroupedById {
    resolvedAuthors = FILTER docWithFullnames BY author IS NOT NULL;
    authors = FOREACH resolvedAuthors GENERATE author;
    GENERATE group AS id, authors;
}

-- Turn the structured `pages` tuple of every document into a single
-- "<start>-<end>" string and cast `year` to chararray.
metadataWithPageBounds = FOREACH metadata GENERATE
    id,
    (chararray)year,
    title,
    journal,
    references,
    FLATTEN(pages) AS (pageStart, pageEnd);

-- NOTE(review): Pig's CONCAT is documented to yield null when either operand
-- is null, so `pages` is presumably null unless both bounds are present.
docFlatMetadata = FOREACH metadataWithPageBounds GENERATE
    id,
    title,
    journal,
    references,
    CONCAT(CONCAT(pageStart, '-'), pageEnd) AS pages,
    year;

-- Attach the resolved author names to each document's flat metadata and pack
-- the descriptive fields into a single `basicMetadata` tuple.
--
-- The join is a LEFT OUTER join on purpose: a document whose `authorIds` bag is
-- empty produces no row when that bag is flattened, so it never appears in
-- docWithFullnamesArray. With an inner join such a document (together with all
-- of its references) would silently disappear from the output.
joined = join docFlatMetadata by id left, docWithFullnamesArray by id;

-- Documents without a match carry a null `authors` bag after the outer join;
-- CREATE_ARRAY (NullToEmptyBag) is applied to it in the same way it is applied
-- to the authors of each reference further down in this script.
-- NOTE(review): assumes NullToEmptyBag passes non-null bags through unchanged
-- and turns null into an empty bag -- confirm against the UDF.
renamedJoined = foreach joined generate
    docFlatMetadata::id as id,
    CREATE_ARRAY(docWithFullnamesArray::authors) as authors,
    docFlatMetadata::title as title,
    docFlatMetadata::journal as journal,
    docFlatMetadata::pages as pages,
    docFlatMetadata::year as year,
    docFlatMetadata::references as references;

-- Output fields: id, references, basicMetadata(authors, title, journal, pages, year).
docBasicMetadataAndRefs = foreach renamedJoined generate
    id, references, (authors, title, journal, pages, year) as basicMetadata;

-- For every document, rebuild its bag of references so that each reference
-- carries (position, basicMetadata, rawText), where basicMetadata has the same
-- (authors, title, journal, pages, year) layout as the document-level tuple.
docCitationMetadata = FOREACH docBasicMetadataAndRefs {
    -- Lift the fields of each reference's own basicMetadata tuple to top level;
    -- they become addressable as basicMetadata::<field>.
    refsUnnested = FOREACH references GENERATE
        position,
        FLATTEN(basicMetadata),
        text;

    -- Rename the lifted fields (`source` becomes `journal`, `text` becomes
    -- `rawText`) and split the reference's page tuple into its two bounds.
    refsWithPageBounds = FOREACH refsUnnested GENERATE
        position,
        basicMetadata::authors AS authors,
        basicMetadata::title AS title,
        basicMetadata::source AS journal,
        basicMetadata::year AS year,
        text AS rawText,
        FLATTEN(basicMetadata::pages) AS (pageStart, pageEnd);

    -- Pass authors through CREATE_ARRAY and join the page bounds into a single
    -- "<start>-<end>" string.
    refsFlat = FOREACH refsWithPageBounds GENERATE
        position,
        title,
        journal,
        year,
        rawText,
        CREATE_ARRAY(authors) AS authors,
        CONCAT(CONCAT(pageStart, '-'), pageEnd) AS pages;

    -- This nested alias reuses the name of the input field `references`; the
    -- final GENERATE below refers to this rebuilt bag.
    references = FOREACH refsFlat GENERATE
        position,
        (authors, title, journal, pages, year) AS basicMetadata,
        rawText;

    GENERATE id, basicMetadata, references AS references;
};

-- Write the final relation to the path given by $output_citation_metadata.
STORE docCitationMetadata
    INTO '$output_citation_metadata'
    USING avro_store_citation_metadata;
