-- Avro loaders, one per input relation. Each AvroStorage instance receives
-- the 'input_schema_class' option with a value taken from a script parameter.
-- NOTE(review): 'input_schema_class' is presumably the name of the generated
-- Avro class describing the records - confirm against the AvroStorage build in use.
DEFINE avro_load_document_metadata org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_metadata');

DEFINE avro_load_document_relation org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_relation');

DEFINE avro_load_document_with_inferenced_data org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_with_inferenced_data');


-- Avro writer used for the single output relation of this script.
-- It is configured with 'index' set to '0' and with the output schema class
-- taken from a script parameter.
-- NOTE(review): 'index' presumably identifies this store among several
-- AvroStorage stores - confirm against the AvroStorage documentation.
DEFINE avro_store_document_with_inferenced_data org.apache.pig.piggybank.storage.avro.AvroStorage(
    'index', '0',
    'output_schema_class', '$schema_output_document_with_inferenced_data');


-- Short alias for the StringBagsDifference UDF, applied below to pairs of id bags.
-- NOTE(review): presumably returns the strings of the first bag that are absent
-- from the second one - confirm against the UDF implementation.
DEFINE STR_BAGS_DIFF eu.dnetlib.iis.transformers.udfs.StringBagsDifference;


-- Read the three Avro inputs. Locations come from script parameters and each
-- relation is read with its dedicated AvroStorage loader.
documentMetadata =
    LOAD '$input_document_metadata'
    USING avro_load_document_metadata;

documentRelation =
    LOAD '$input_document_relation'
    USING avro_load_document_relation;

documentWithInferencedData =
    LOAD '$input_document_with_inferenced_data'
    USING avro_load_document_with_inferenced_data;

-- Combine document metadata and document relations into one record per id,
-- carrying the projectIds from the metadata and the referencedIds from the relations.
--
-- The join is a FULL outer join, so a row may originate from only one side.
-- The id is therefore taken from whichever side is present. Projecting only
-- documentMetadata::id would yield a null id for documents that exist solely
-- in documentRelation, and because null join keys never match in Pig those
-- rows (and their referencedIds) would be lost in the subsequent join by id.
-- For rows coming from one side only, the field of the missing side is null.
-- NOTE(review): assumes both id fields have the same type - confirm against the schemas.
joinedSubs = join documentMetadata by id full, documentRelation by id;
joinedSubsCleaned = foreach joinedSubs generate
    (documentMetadata::id is not null ? documentMetadata::id : documentRelation::id) as id,
    documentMetadata::projectIds as projectIds,
    documentRelation::referencedIds as referencedIds;

-- Attach the combined projectIds and referencedIds to every inferred-data record.
-- LEFT OUTER keeps each documentWithInferencedData row even when no matching
-- id exists in joinedSubsCleaned.
joinedWithSubs =
    JOIN documentWithInferencedData BY id LEFT OUTER,
         joinedSubsCleaned BY id;

-- Rebuild the output record. Every field is copied unchanged from
-- documentWithInferencedData except projectIds and referencedDataSetIds,
-- which are passed through STR_BAGS_DIFF together with the matching bag
-- from joinedSubsCleaned.
-- NOTE(review): for rows without a match the second STR_BAGS_DIFF argument
-- is null - confirm that the UDF tolerates a null bag.
joinedWithSubsCleaned =
    FOREACH joinedWithSubs GENERATE
        documentWithInferencedData::id AS id,
        documentWithInferencedData::title AS title,
        documentWithInferencedData::abstract AS abstract,
        documentWithInferencedData::language AS language,
        documentWithInferencedData::keywords AS keywords,
        documentWithInferencedData::externalIdentifiers AS externalIdentifiers,
        documentWithInferencedData::journal AS journal,
        documentWithInferencedData::year AS year,
        documentWithInferencedData::publisher AS publisher,
        documentWithInferencedData::text AS text,
        STR_BAGS_DIFF(documentWithInferencedData::projectIds,
                      joinedSubsCleaned::projectIds) AS projectIds,
        documentWithInferencedData::authorIds AS authorIds,
        documentWithInferencedData::matchedCitationDocumentIds AS matchedCitationDocumentIds,
        STR_BAGS_DIFF(documentWithInferencedData::referencedDataSetIds,
                      joinedSubsCleaned::referencedIds) AS referencedDataSetIds,
        documentWithInferencedData::researchInitiativeConceptIds AS researchInitiativeConceptIds,
        documentWithInferencedData::clusters AS clusters,
        documentWithInferencedData::classes AS classes,
        documentWithInferencedData::statistics AS statistics,
        documentWithInferencedData::websiteUsageSimilarities AS websiteUsageSimilarities;

-- Write the rebuilt records to the output location given by a script parameter,
-- using the Avro writer defined for this relation.
STORE joinedWithSubsCleaned
    INTO '$output_document_with_inferenced_data'
    USING avro_store_document_with_inferenced_data;
