#!/usr/bin/perl

use strict;
use utf8;
use warnings;
use XML::LibXML;
use Encode;
use Data::Dumper;
use JSON;
use String::Util qw(trim);
use URI::Encode qw(uri_encode);
use Text::CSV;

# One XML parser instance is shared by all the input records.
my $parser = XML::LibXML->new;

# Main loop. Every input line (read from STDIN or from the files named on the
# command line) is a JSON object holding an 'id' and a 'record', the latter
# being an XML document serialised as a string.
#
# For each line one tab-separated row is printed to STDOUT with the columns:
#   id, record type, "INSERT", JSON summary of the record,
#   "mongo-migration", dateOfCollection
while (my $line = <>) {
	# Skip blank lines: decode_json() dies on an empty document, which would
	# abort the whole run (e.g. on a stray empty line at the end of the input).
	next unless $line =~ /\S/;

	my $r = decode_json($line);

	my $id = $r->{'id'};
	my $xml = $r->{'record'};

	my $doc = $parser->parse_string($xml);
	my $root = $doc->documentElement();

	# Trimmed string value of an XPath expression evaluated on this record.
	my $value_of = sub {
		my ($xpath) = @_;
		return trim($root->findvalue($xpath));
	};

	my $record = {};
	$record->{'openaireId'} = $value_of->('(//*[local-name()="objIdentifier"])[1]');
	$record->{'originalId'} = $value_of->('(//originalId)[1]');
	$record->{'title'} = $value_of->('(//title[@classid="main title"])[1]');

	$record->{'authors'} = [ map { trim($_->textContent) } $root->findnodes('//creator') ];

	$record->{'publisher'} = $value_of->('(//publisher)[1]');
	$record->{'description'} = $value_of->('(//description)[1]');
	$record->{'language'} = $value_of->('(//language/@classid)[1]');

	$record->{'pids'} = [
		map { +{ 'type' => trim($_->findvalue('@classid')), 'value' => trim($_->textContent) } }
			$root->findnodes('//pid')
	];

	# Classification fields fall back to a fixed default when the XML has no value.
	$record->{'accessRightCode'} = $value_of->('(//bestaccessright/@classid)[1]') || 'UNKNOWN';
	$record->{'embargoEndDate'} = $value_of->('(//embargoenddate)[1]');
	$record->{'type'} = $value_of->('(//resulttype/@classid)[1]') || 'other';
	$record->{'resourceType'} = $value_of->('(//resourcetype/@classid)[1]') || '0000';

	$record->{'url'} = $value_of->('(//webresource/url)[1]');
	$record->{'collectedFromId'} = fixDsId($value_of->('(//collectedfrom/@id)[1]'));
	$record->{'hostedById'} = fixDsId($value_of->('(//hostedby/@id)[1]'));

	$record->{'contexts'} = [ map { trim($_->findvalue('@id')) } $root->findnodes('//context') ];

	# Only the relations pointing to a project are exported, each one as an
	# info:eu-repo/grantAgreement identifier built from URI-encoded parts.
	$record->{'linksToProjects'} = [];
	foreach my $rel ($root->findnodes('//rels/rel')) {
		next unless $rel->findvalue('./to/@type') eq 'project';

		my $funder = uri_encode(trim($rel->findvalue('.//funder/@shortname')));
		my $program = uri_encode(trim($rel->findvalue('.//funding_level_0/@name')));
		my $code = uri_encode(trim($rel->findvalue('./code')));
		my $jurisdiction = uri_encode(trim($rel->findvalue('.//funder/@jurisdiction')));
		my $acronym = uri_encode(trim($rel->findvalue('./acronym')));

		push @{$record->{'linksToProjects'}}, "info:eu-repo/grantAgreement/$funder/$program/$code/$jurisdiction//$acronym";
	}

	# Remove the empty fields. The test is on the length (not on the truth) of
	# the value, so that a legitimate "0" is not mistaken for an empty string.
	# Array-valued fields are references and are therefore always kept, even
	# when they contain no element.
	foreach my $key (keys %{$record}) {
		my $value = $record->{$key};
		delete $record->{$key} unless (defined($value) && length($value));
	}

	# Restricted to the first match like every other single-valued lookup:
	# findvalue() concatenates the text of all the matching nodes.
	my $date = $value_of->('(//*[local-name()="dateOfCollection"])[1]');

	print join("\t",
		endodeField($id),
		endodeField($record->{'type'}),
		'INSERT',
		endodeField(encode_json($record)),
		'mongo-migration',
		endodeField($date),
	), "\n";
}

# Cleans a value so that it can be written as a single column of the
# tab-separated output.
#
# Embedded tabs and newlines are replaced by a space: a literal tab inside a
# value would be read back as an additional column separator and a newline as
# the end of the row. Leading and trailing whitespace is then trimmed.
#
# Parameter: the raw value.
# Returns:   the cleaned string; an undefined value yields the empty string.
sub endodeField {
	my ($s) = @_;

	return '' unless defined $s;

	$s =~ s/[\t\n]/ /g;

	return trim($s);
}

# Translates a datasource identifier found in a record into the identifier
# written to the output.
#
# Only the identifiers listed in the table below are translated; any other
# value (an empty or undefined one included) is mapped to the UNKNOWN
# datasource.
#
# Parameter: the datasource identifier read from the record.
# Returns:   the identifier to store.
sub fixDsId {
	my ($id) = @_;

	my $unknown = 'openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18';

	my %fixed_id_for = (
		# ZENODO: the prefix is padded with four underscores, like its lookup key
		'opendoar____::358aee4cc897452c00244351e4d91f69' => 'opendoar____::2659',
		'openaire____::55045bd2a65019fd8e6741a755395c8c' => $unknown,                   # UNKNOWN
		'openaire____::081b82f96300b6a6e3d282bad31cb6e2' => 'openaire____::crossref',   # Crossref
		'openaire____::9e3be59865b2c1c335d32dae2fe7b254' => 'openaire____::datacite',   # Datacite
	);

	return $unknown unless defined $id;

	return exists $fixed_id_for{$id} ? $fixed_id_for{$id} : $unknown;
}


1;
