#!/bin/bash
# Author: Wolfgang Sauer.
# See https://support.d4science.org/issues/8626#change-58157

# Site-specific setting: adjust to the local installation -----------------
# Directory the script changes into; archives are downloaded and unpacked here.
WORK_DIR="/var/lib/ariadneplus-data/clarin"

# Fixed settings ----------------------------------------------------------
# Archive member to extract; also the sub-path of WORK_DIR that is processed.
CMDI_PATH="results/cmdi"
# Space-separated list of archive file names fetched from HARVESTER_URL.
RESULTSETS="clarin.tar.bz2 others.tar.bz2"
# Base URL the archive names are appended to.
HARVESTER_URL="https://vlo.clarin.eu/resultsets"
# End of fixed settings ---------------------------------------------------

# Remove the extraction tree left over from a previous run.
# ${WORK_DIR:?} aborts the script if WORK_DIR is unset or empty, so the
# recursive delete can never be applied to "/results".
if [[ -e "${WORK_DIR:?WORK_DIR must be set}/results" ]]; then
	echo "removing old results..."
	rm -rf -- "${WORK_DIR:?}/results"
fi

# Everything below downloads and unpacks relative to the current directory;
# stop here rather than doing that in whatever directory we were started from.
cd -- "$WORK_DIR" || {
	echo "cannot change to $WORK_DIR" >&2
	exit 1
}

# Download the harvested records, unpack them, and clean up the archives.
# RESULTSETS is a space-separated list, so word splitting is intended here.
# A result set that cannot be downloaded is reported on stderr and skipped.
# shellcheck disable=SC2086
for RESULTSET in $RESULTSETS; do
	# download tar; -O overwrites a stale copy instead of letting wget save
	# the fresh download under a different name (NAME.1)
	if ! wget -O "$RESULTSET" "$HARVESTER_URL/$RESULTSET"; then
		echo "download of $RESULTSET failed, skipping" >&2
		# wget -O creates the output file even when the download fails
		rm -f -- "$RESULTSET"
		continue
	fi

	echo "unpacking $RESULTSET..."
	# unpack CMDI 1.2 files (only the $CMDI_PATH member of the archive)
	if ! tar xjf "$RESULTSET" "$CMDI_PATH"; then
		echo "unpacking $RESULTSET failed" >&2
	fi

	# delete tar
	rm -f -- "$RESULTSET"
done

echo "correcting hdl string and reorganizing files by profile id..."

# For every XML file below $WORK_DIR/$CMDI_PATH: rewrite "hdl:" prefixes as
# http://hdl.handle.net/ URLs, then move the file into a directory named
# after its profile id (p_<digits>), or into "noProfileID" if none is found.
# NUL-delimited names together with IFS= keep paths containing spaces or
# other unusual characters intact.
find "$WORK_DIR/$CMDI_PATH" -name '*.xml' -print0 | while IFS= read -r -d '' file; do
	echo "treating file $file"
	# add http to hdl string
	sed -i 's|hdl:|http://hdl.handle.net/|g' -- "$file"

	# organizing files by profile id; several matches are possible, so only
	# the first is kept to make sure the directory name is a single token
	profileID=$(grep -oP '<cmd:MdProfile>.+p_[0-9]+</cmd:MdProfile>' -- "$file" \
		| grep -oE 'p_[0-9]+' \
		| head -n 1)
	if [[ -n "$profileID" ]]; then
		outDir=$WORK_DIR/$CMDI_PATH/$profileID
	else
		outDir=$WORK_DIR/$CMDI_PATH/noProfileID
	fi

	# find may descend into target directories created by this very loop;
	# a file that already sits in its target directory needs no move
	if [[ "${file%/*}" == "$outDir" ]]; then
		continue
	fi

	if ! mkdir -p -- "$outDir"; then
		echo "cannot create $outDir, leaving $file in place" >&2
		continue
	fi

	# The trailing slash makes mv fail instead of renaming the file should
	# the target not be a directory.
	# NOTE(review): files with identical names coming from different source
	# directories overwrite each other here - confirm names are unique.
	mv -- "$file" "$outDir/"
done

echo "removing empty directories..."
# Delete every empty directory below (and including) $WORK_DIR/$CMDI_PATH.
# -delete implies depth-first traversal, so a directory whose only content
# was empty subdirectories is removed as well. Directories that still hold
# files are left untouched.
find "$WORK_DIR/$CMDI_PATH" -type d -empty -delete

echo "finished!"