-- Avro load functions, one per input relation read by this script.
-- Each one instantiates AvroStorage with the input_schema_class option,
-- whose value is supplied through a script parameter.

-- Extracted document metadata.
DEFINE avro_load_extracted_document_metadata org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_extracted_document_metadata'
);

-- Citations between documents.
DEFINE avro_load_citation org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_citation'
);

-- Document to project relations.
DEFINE avro_load_document_to_project org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_to_project'
);

-- Document to dataset relations.
DEFINE avro_load_document_to_dataset org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_to_dataset'
);

-- Document to research initiative relations.
DEFINE avro_load_document_to_research_initiative org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_to_research_initiative'
);

-- Document clusters.
DEFINE avro_load_document_to_document_clusters org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_to_document_clusters'
);

-- Document classes.
DEFINE avro_load_document_to_document_classes org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_to_document_classes'
);

-- Document statistics.
DEFINE avro_load_document_to_document_statistics org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_to_document_statistics'
);

-- Website usage similarities between documents.
DEFINE avro_load_document_with_website_usage_similarities org.apache.pig.piggybank.storage.avro.AvroStorage(
    'input_schema_class', '$schema_input_document_with_website_usage_similarities'
);


-- Avro store function used for the single output of this script.
-- The output schema class comes from a script parameter.
-- NOTE(review): the index option is passed as 0. In piggybank AvroStorage
-- this option is presumably an identifier of the store instance. Confirm.
DEFINE avro_store_document_with_inferenced_data org.apache.pig.piggybank.storage.avro.AvroStorage(
    'index', '0',
    'output_schema_class', '$schema_output_document_with_inferenced_data'
);


-- Short aliases for the IIS transformer UDFs.
-- NOTE(review): the UDF implementations are not visible in this script.
-- The descriptions below are inferred from the class names only.

-- Used on the two join keys after each full outer join
-- (presumably returns the first non-empty string argument).
DEFINE FIRST_NOT_NULL_STR      eu.dnetlib.iis.transformers.udfs.StringFirstNotEmpty;

-- Used on bag fields (presumably turns an empty bag into null).
DEFINE NULL_EMPTY              eu.dnetlib.iis.transformers.udfs.EmptyBagToNull;

-- The two aliases below are not referenced in the visible part of the script.
DEFINE NULL_EMPTY_TUPLE_FIELDS eu.dnetlib.iis.transformers.udfs.NullTupleFieldsToNull;
DEFINE CREATE_ARRAY            eu.dnetlib.iis.transformers.udfs.NullToEmptyBag;

-- Read every input relation. Paths are script parameters and each input
-- uses the Avro loader defined for its schema.
extractedDocument = LOAD '$input_extracted_document_metadata'
    USING avro_load_extracted_document_metadata;

citation = LOAD '$input_citation'
    USING avro_load_citation;

documentToProject = LOAD '$input_document_to_project'
    USING avro_load_document_to_project;

documentToDataset = LOAD '$input_document_to_dataset'
    USING avro_load_document_to_dataset;

documentToResearchInitiative = LOAD '$input_document_to_research_initiative'
    USING avro_load_document_to_research_initiative;

documentToDocumentClusters = LOAD '$input_document_to_document_clusters'
    USING avro_load_document_to_document_clusters;

documentToDocumentClasses = LOAD '$input_document_to_document_classes'
    USING avro_load_document_to_document_classes;

documentToDocumentStatistics = LOAD '$input_document_to_document_statistics'
    USING avro_load_document_to_document_statistics;

documentWithWebsiteUsageSimilarities = LOAD '$input_document_with_website_usage_similarities'
    USING avro_load_document_with_website_usage_similarities;

-- Collapse document to project relations into one row per document:
-- (id, projectIds) where projectIds is a bag of single-field tuples.
documentToProjectGroupped = GROUP documentToProject BY documentId;
documentToProjectWithArrays = FOREACH documentToProjectGroupped {
    -- keep only the project identifier of every grouped record
    onlyProjectIds = FOREACH documentToProject GENERATE projectId;
    GENERATE
        group AS id,
        onlyProjectIds AS projectIds;
}

-- Collapse document to dataset relations into one row per document:
-- (id, datasetIds) where datasetIds is a bag of single-field tuples.
documentToDatasetGroupped = GROUP documentToDataset BY documentId;
documentToDatasetWithArrays = FOREACH documentToDatasetGroupped {
    -- keep only the dataset identifier of every grouped record
    onlyDatasetIds = FOREACH documentToDataset GENERATE datasetId;
    GENERATE
        group AS id,
        onlyDatasetIds AS datasetIds;
}

-- Collect the citations of every source document into one row:
-- (id, citations) where each citation is (id, text), taken from the
-- destination document id and the raw citation text.
citationGroupped = GROUP citation BY sourceDocumentId;
citationGrouppedWithText = FOREACH citationGroupped {
    targetWithText = FOREACH citation GENERATE
        destinationDocumentId AS id,
        rawText AS text;
    GENERATE
        group AS id,
        targetWithText AS citations;
}

-- Step 1: full outer join of project ids with citations on document id.
joined1 = JOIN documentToProjectWithArrays BY id FULL OUTER,
               citationGrouppedWithText BY id;

-- After a full outer join either side may be missing, so the id is taken
-- from both sides through FIRST_NOT_NULL_STR.
-- authorIds has no source relation in this script and is built from a
-- null literal.
joined1Cleaned = FOREACH joined1 GENERATE
    FIRST_NOT_NULL_STR(documentToProjectWithArrays::id,
                       citationGrouppedWithText::id)     AS id,
    NULL_EMPTY(null)                                     AS authorIds,
    citationGrouppedWithText::citations                  AS matchedCitationDocumentIds,
    NULL_EMPTY(documentToProjectWithArrays::projectIds)  AS projectIds;

-- Step 2: add dataset ids through a full outer join on document id.
joined2 = JOIN joined1Cleaned BY id FULL OUTER,
               documentToDatasetWithArrays BY id;

-- Carry over the fields collected so far and append datasetIds.
joined2Cleaned = FOREACH joined2 GENERATE
    FIRST_NOT_NULL_STR(joined1Cleaned::id,
                       documentToDatasetWithArrays::id)  AS id,
    joined1Cleaned::authorIds                            AS authorIds,
    joined1Cleaned::matchedCitationDocumentIds           AS matchedCitationDocumentIds,
    joined1Cleaned::projectIds                           AS projectIds,
    NULL_EMPTY(documentToDatasetWithArrays::datasetIds)  AS datasetIds;

-- Step 3: add document clusters through a full outer join. The clusters
-- relation is keyed by documentId rather than id.
joined3 = JOIN joined2Cleaned BY id FULL OUTER,
               documentToDocumentClusters BY documentId;

-- Carry over the fields collected so far and append clusters.
joined3Cleaned = FOREACH joined3 GENERATE
    FIRST_NOT_NULL_STR(joined2Cleaned::id,
                       documentToDocumentClusters::documentId)  AS id,
    joined2Cleaned::authorIds                                   AS authorIds,
    joined2Cleaned::matchedCitationDocumentIds                  AS matchedCitationDocumentIds,
    joined2Cleaned::projectIds                                  AS projectIds,
    joined2Cleaned::datasetIds                                  AS datasetIds,
    documentToDocumentClusters::clusters                        AS clusters;

-- Step 4: add document classes through a full outer join. The classes
-- relation is keyed by documentId rather than id.
joined4 = JOIN joined3Cleaned BY id FULL OUTER,
               documentToDocumentClasses BY documentId;

-- Carry over the fields collected so far and append classes.
joined4Cleaned = FOREACH joined4 GENERATE
    FIRST_NOT_NULL_STR(joined3Cleaned::id,
                       documentToDocumentClasses::documentId)  AS id,
    joined3Cleaned::authorIds                                  AS authorIds,
    joined3Cleaned::matchedCitationDocumentIds                 AS matchedCitationDocumentIds,
    joined3Cleaned::projectIds                                 AS projectIds,
    joined3Cleaned::datasetIds                                 AS datasetIds,
    joined3Cleaned::clusters                                   AS clusters,
    documentToDocumentClasses::classes                         AS classes;

-- Step 5: add document statistics through a full outer join. The statistics
-- relation is keyed by documentId rather than id.
joined5 = JOIN joined4Cleaned BY id FULL OUTER,
               documentToDocumentStatistics BY documentId;

-- Carry over the fields collected so far and append statistics.
joined5Cleaned = FOREACH joined5 GENERATE
    FIRST_NOT_NULL_STR(joined4Cleaned::id,
                       documentToDocumentStatistics::documentId)  AS id,
    joined4Cleaned::authorIds                                     AS authorIds,
    joined4Cleaned::matchedCitationDocumentIds                    AS matchedCitationDocumentIds,
    joined4Cleaned::projectIds                                    AS projectIds,
    joined4Cleaned::datasetIds                                    AS datasetIds,
    joined4Cleaned::clusters                                      AS clusters,
    joined4Cleaned::classes                                       AS classes,
    documentToDocumentStatistics::statistics                      AS statistics;

-- Collect website usage similarities per document:
-- (id, websiteUsageSimilarities) where each entry holds the other document
-- id (exposed as documentId) and the covisited similarity value.
documentWithWebsiteUsageSimilaritiesGroupped =
    GROUP documentWithWebsiteUsageSimilarities BY documentId;
outputSimilarities = FOREACH documentWithWebsiteUsageSimilaritiesGroupped {
    similarityEntries = FOREACH documentWithWebsiteUsageSimilarities GENERATE
        otherDocumentId AS documentId,
        covisitedSimilarity AS covisitedSimilarity;
    GENERATE
        group AS id,
        similarityEntries AS websiteUsageSimilarities;
}

-- Step 6: add website usage similarities through a full outer join on
-- document id.
joined6 = JOIN joined5Cleaned BY id FULL OUTER,
               outputSimilarities BY id;

-- Carry over the fields collected so far and append websiteUsageSimilarities.
joined6Cleaned = FOREACH joined6 GENERATE
    FIRST_NOT_NULL_STR(joined5Cleaned::id,
                       outputSimilarities::id)      AS id,
    joined5Cleaned::authorIds                       AS authorIds,
    joined5Cleaned::matchedCitationDocumentIds      AS matchedCitationDocumentIds,
    joined5Cleaned::projectIds                      AS projectIds,
    joined5Cleaned::datasetIds                      AS datasetIds,
    joined5Cleaned::clusters                        AS clusters,
    joined5Cleaned::classes                         AS classes,
    joined5Cleaned::statistics                      AS statistics,
    outputSimilarities::websiteUsageSimilarities    AS websiteUsageSimilarities;

-- Collapse document to research initiative relations into one row per
-- document: (id, researchInitiativeConceptIds), a bag built from the
-- egiConceptId field of every grouped record.
researchInitiativeGroupped = GROUP documentToResearchInitiative BY documentId;
researchInitiative = FOREACH researchInitiativeGroupped {
    conceptIds = FOREACH documentToResearchInitiative GENERATE egiConceptId;
    GENERATE
        group AS id,
        conceptIds AS researchInitiativeConceptIds;
}

-- Step 7: add research initiative concept ids through a full outer join on
-- document id.
joined7 = JOIN joined6Cleaned BY id FULL OUTER,
               researchInitiative BY id;

-- Carry over the fields collected so far. The new field is placed right
-- after datasetIds, the field order is kept as in the original projection.
joined7Cleaned = FOREACH joined7 GENERATE
    FIRST_NOT_NULL_STR(joined6Cleaned::id,
                       researchInitiative::id)          AS id,
    joined6Cleaned::authorIds                           AS authorIds,
    joined6Cleaned::matchedCitationDocumentIds          AS matchedCitationDocumentIds,
    joined6Cleaned::projectIds                          AS projectIds,
    joined6Cleaned::datasetIds                          AS datasetIds,
    researchInitiative::researchInitiativeConceptIds    AS researchInitiativeConceptIds,
    joined6Cleaned::clusters                            AS clusters,
    joined6Cleaned::classes                             AS classes,
    joined6Cleaned::statistics                          AS statistics,
    joined6Cleaned::websiteUsageSimilarities            AS websiteUsageSimilarities;

-- Final step: full outer join of the extracted document metadata with all
-- inferred data collected in joined7Cleaned.
joinedFull = JOIN extractedDocument BY id FULL OUTER,
                  joined7Cleaned BY id;

-- Build the output record.
--   * metadata fields come from extractedDocument
--   * text is always emitted as a null chararray
--   * datasetIds is exposed under the name referencedDataSetIds
joinedFullCleaned = FOREACH joinedFull GENERATE
    FIRST_NOT_NULL_STR(extractedDocument::id, joined7Cleaned::id) AS id,
    extractedDocument::title                        AS title,
    extractedDocument::abstract                     AS abstract,
    extractedDocument::language                     AS language,
    extractedDocument::keywords                     AS keywords,
    extractedDocument::externalIdentifiers          AS externalIdentifiers,
    extractedDocument::journal                      AS journal,
    extractedDocument::year                         AS year,
    extractedDocument::publisher                    AS publisher,
    (chararray)null                                 AS text,
    joined7Cleaned::projectIds                      AS projectIds,
    joined7Cleaned::authorIds                       AS authorIds,
    joined7Cleaned::matchedCitationDocumentIds      AS matchedCitationDocumentIds,
    joined7Cleaned::datasetIds                      AS referencedDataSetIds,
    joined7Cleaned::researchInitiativeConceptIds    AS researchInitiativeConceptIds,
    joined7Cleaned::clusters                        AS clusters,
    joined7Cleaned::classes                         AS classes,
    joined7Cleaned::statistics                      AS statistics,
    joined7Cleaned::websiteUsageSimilarities        AS websiteUsageSimilarities;

-- Write the merged records to the output path given as a script parameter,
-- using the Avro store function defined for the output schema.
STORE joinedFullCleaned
    INTO '$output_document_with_inferenced_data'
    USING avro_store_document_with_inferenced_data;
