-- Avro loader for the documents-with-inferenced-data input.
-- The schema class name is supplied through a script parameter.
DEFINE avro_load_document_with_inferenced_data
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'input_schema_class', '$schema_input_document_with_inferenced_data');

-- Avro loader for the document identifier input.
DEFINE avro_load_document_id
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'input_schema_class', '$schema_input_document_id');

-- Avro writer for the resulting identifiers.
-- NOTE(review): the index option is passed through unchanged as the string 0,
-- confirm its meaning against the AvroStorage build used by this project.
DEFINE avro_store_identifier
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'index', '0',
        'output_schema_class', '$schema_output_identifier');


-- Purpose: emit every referenced data set id that does not appear
-- among the ids of the document id input.

-- Read both inputs through their Avro loaders.
inferredDocs = LOAD '$input_document_with_inferenced_data' USING avro_load_document_with_inferenced_data;
knownIdsRaw = LOAD '$input_document_id' USING avro_load_document_id;

-- Keep only the first field of the document id records and name it id.
knownIds = FOREACH knownIdsRaw GENERATE $0 AS id;

-- One row per referenced data set id, then drop nulls and duplicates.
referencedIdsAll = FOREACH inferredDocs GENERATE FLATTEN(referencedDataSetIds) AS id;
referencedIdsPresent = FILTER referencedIdsAll BY id IS NOT NULL;
referencedIds = DISTINCT referencedIdsPresent;

-- Anti-join: a left outer join leaves the right side null for referenced ids
-- that have no match among the known ids, and only those rows are kept.
matched = JOIN referencedIds BY id LEFT OUTER, knownIds BY id;
unmatched = FILTER matched BY knownIds::id IS NULL;
missingIds = FOREACH unmatched GENERATE referencedIds::id AS id;

-- Write the unmatched ids to the output location.
STORE missingIds INTO '$output_identifier' USING avro_store_identifier;
