/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.dhp.oa.graph.clean;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
import eu.dnetlib.dhp.oa.graph.clean.CleaningRuleMap;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.FileAttribute;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.FalseFileFilter;
import org.apache.commons.io.filefilter.IOFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ExtendWith(value={MockitoExtension.class})
public class CleanGraphSparkJobTest {
    // Logger for test diagnostics (beforeAll logs the temporary base path through it).
    private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJobTest.class);
    // Shared JSON mapper, configured not to fail on properties unknown to the target class.
    public static final ObjectMapper MAPPER = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    // Mockito mock of the lookup service; its quickSearchProfile answers are stubbed in setUp.
    @Mock
    private ISLookUpService isLookUpService;
    // Vocabularies loaded from the mocked lookup service in setUp.
    private VocabularyGroup vocabularies;
    // Cleaning rules created from the vocabularies in setUp.
    private CleaningRuleMap mapping;
    // Spark session shared by all tests: created in beforeAll, stopped in afterAll.
    private static SparkSession spark;
    // Temporary directory created in beforeAll and deleted in afterAll.
    private static Path testBaseTmpPath;
    // String form of <testBaseTmpPath>/input/graph, where beforeAll copies the test graph resources.
    private static String graphInputPath;
    // String form of <testBaseTmpPath>/output/graph, passed to the job as the base of --outputPath.
    private static String graphOutputPath;
    // String form of <testBaseTmpPath>/workingDir/masterduplicate, where beforeAll copies masterduplicate.json.
    private static String dsMasterDuplicatePath;

    /**
     * Prepares the shared fixture: creates a temporary base directory, copies the test graph
     * and the master/duplicate mapping from the classpath into it, and starts a local Spark session.
     *
     * <p>Directory names are taken from {@link File#getName()} instead of splitting the absolute
     * path on {@code "/"}, so the copy does not depend on the platform's path separator.
     *
     * @throws IOException if the temporary directory cannot be created or a copy fails
     * @throws URISyntaxException if a classpath resource URL cannot be converted to a URI
     */
    @BeforeAll
    public static void beforeAll() throws IOException, URISyntaxException {
        testBaseTmpPath = Files.createTempDirectory(CleanGraphSparkJobTest.class.getSimpleName());
        log.info("using test base path {}", testBaseTmpPath);

        final Path inputGraphDir = testBaseTmpPath.resolve("input").resolve("graph");
        final Path masterDuplicateDir = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate");

        final File basePath = Paths
            .get(Objects.requireNonNull(CleanGraphSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/graph")).toURI())
            .toFile();

        // Only directories are listed (the file filter rejects everything); directories named
        // "graph" - which is the name of the start directory itself - are left out.
        final List<File> typeDirs = FileUtils
            .listFilesAndDirs(basePath, FalseFileFilter.FALSE, TrueFileFilter.TRUE)
            .stream()
            .filter(dir -> !"graph".equals(dir.getName()))
            .collect(Collectors.toList());

        // Each remaining directory is copied to <input graph dir>/<directory name>.
        for (final File typeDir : typeDirs) {
            FileUtils.copyDirectory(typeDir, inputGraphDir.resolve(typeDir.getName()).toFile());
        }

        final File masterDuplicate = Paths
            .get(Objects.requireNonNull(CleanGraphSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")).toURI())
            .toFile();
        FileUtils.copyFileToDirectory(masterDuplicate, masterDuplicateDir.toFile());

        graphInputPath = inputGraphDir.toString();
        graphOutputPath = testBaseTmpPath.resolve("output").resolve("graph").toString();
        dsMasterDuplicatePath = masterDuplicateDir.toString();

        final SparkConf conf = new SparkConf();
        conf.setAppName(CleanGraphSparkJobTest.class.getSimpleName());
        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", testBaseTmpPath.toString());
        conf.set("hive.metastore.warehouse.dir", testBaseTmpPath.resolve("warehouse").toString());
        spark = SparkSession.builder().config(conf).getOrCreate();
    }

    /**
     * Stubs the mocked lookup service for the two vocabulary XQuery strings below and then
     * builds {@code vocabularies} and {@code mapping} from it.
     *
     * <p>The stubs are lenient, so Mockito does not complain in tests that never hit them.
     *
     * @throws ISLookUpException declared by the lookup service calls
     * @throws IOException declared by this method; raised by the helpers it calls, if at all
     */
    @BeforeEach
    public void setUp() throws ISLookUpException, IOException {
        // Query answered with vocs().
        final String termsQuery = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') \n"
            + "let $vocid := $x//VOCABULARY_NAME/@code\n"
            + "let $vocname := $x//VOCABULARY_NAME/text()\n"
            + "for $term in ($x//TERM)\n"
            + "return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)";
        // Query answered with synonyms().
        final String synonymsQuery = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n"
            + "let $vocid := $x//VOCABULARY_NAME/@code\n"
            + "let $vocname := $x//VOCABULARY_NAME/text()\n"
            + "for $term in ($x//TERM)\n"
            + "for $syn in ($term//SYNONYM/@term)\n"
            + "return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)\n";

        Mockito.lenient().when((Object) isLookUpService.quickSearchProfile(termsQuery)).thenReturn(vocs());
        Mockito.lenient().when((Object) isLookUpService.quickSearchProfile(synonymsQuery)).thenReturn(synonyms());

        vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
        mapping = CleaningRuleMap.create(vocabularies);
    }

    /**
     * Releases the shared fixture: stops the Spark session and deletes the temporary base directory.
     *
     * <p>Spark is stopped first, and the directory removal sits in a {@code finally} block, so a
     * failure in one step does not prevent the other from running. Both statics are null-checked
     * because {@code beforeAll} may have failed before assigning them.
     *
     * @throws IOException if the temporary directory cannot be deleted
     */
    @AfterAll
    public static void afterAll() throws IOException {
        try {
            if (spark != null) {
                spark.stop();
            }
        } finally {
            if (testBaseTmpPath != null) {
                FileUtils.deleteDirectory(testBaseTmpPath.toFile());
            }
        }
    }

    /**
     * Runs the cleaning job (deepClean = false) on the relation table.
     *
     * <p>Before the run, no input relation has a relClass listed in "dnet:relation_relClass".
     * After the run, every output relation has relClass and subRelType listed in their
     * vocabularies and the provenance action "iis" / "Inferred by OpenAIRE".
     */
    @Test
    void testCleanRelations() throws Exception {
        final String relationsIn = graphInputPath + "/relation";
        final String relationsOut = graphOutputPath + "/relation";

        final List<Relation> dirty = spark
            .read()
            .textFile(relationsIn)
            .map(CleanGraphSparkJobTest.as(Relation.class), Encoders.bean(Relation.class))
            .collectAsList();
        for (final Relation rel : dirty) {
            Assertions.assertFalse(this.vocabularies.getTerms("dnet:relation_relClass").contains(rel.getRelClass()));
        }

        final String[] jobArgs = {
            "--inputPath", relationsIn,
            "--outputPath", relationsOut,
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", Relation.class.getCanonicalName(),
            "--deepClean", "false",
            "--masterDuplicatePath", dsMasterDuplicatePath
        };
        final CleanGraphSparkJob job = new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", jobArgs));
        job.run(Boolean.FALSE, this.isLookUpService);

        final List<Relation> cleaned = spark
            .read()
            .textFile(relationsOut)
            .map(CleanGraphSparkJobTest.as(Relation.class), Encoders.bean(Relation.class))
            .collectAsList();
        for (final Relation rel : cleaned) {
            Assertions.assertTrue(this.vocabularies.getTerms("dnet:relation_relClass").contains(rel.getRelClass()));
            Assertions.assertTrue(this.vocabularies.getTerms("dnet:relation_subRelType").contains(rel.getSubRelType()));
            Assertions.assertEquals("iis", rel.getDataInfo().getProvenanceaction().getClassid());
            Assertions.assertEquals("Inferred by OpenAIRE", rel.getDataInfo().getProvenanceaction().getClassname());
        }
    }

    /**
     * Checks that {@code GraphCleaningFunctions.filter} returns true for the publication stored
     * in {@code result_invisible.json}.
     *
     * <p>The resource is parsed straight from its stream inside try-with-resources, so the stream
     * is always closed and the text is not decoded with the platform default charset.
     */
    @Test
    void testFilter_invisible_true() throws Exception {
        Assertions.assertNotNull(this.vocabularies);
        Assertions.assertNotNull(this.mapping);

        final Publication p_in;
        try (InputStream json = Objects.requireNonNull(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json"))) {
            p_in = MAPPER.readValue(json, Publication.class);
        }

        Assertions.assertTrue(p_in instanceof Result);
        Assertions.assertTrue(p_in instanceof Publication);
        Assertions.assertTrue(GraphCleaningFunctions.filter(p_in));
    }

    /**
     * Checks that {@code GraphCleaningFunctions.filter} returns true for the publication stored
     * in {@code result.json}.
     *
     * <p>The resource is parsed straight from its stream inside try-with-resources, so the stream
     * is always closed and the text is not decoded with the platform default charset.
     */
    @Test
    void testFilter_true_nothing_to_filter() throws Exception {
        Assertions.assertNotNull(this.vocabularies);
        Assertions.assertNotNull(this.mapping);

        final Publication p_in;
        try (InputStream json = Objects.requireNonNull(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"))) {
            p_in = MAPPER.readValue(json, Publication.class);
        }

        Assertions.assertTrue(p_in instanceof Result);
        Assertions.assertTrue(p_in instanceof Publication);
        Assertions.assertTrue(GraphCleaningFunctions.filter(p_in));
    }

    /**
     * Checks that {@code GraphCleaningFunctions.filter} returns true for the publication stored
     * in {@code result_missing_invisible.json}.
     *
     * <p>The resource is parsed straight from its stream inside try-with-resources, so the stream
     * is always closed and the text is not decoded with the platform default charset.
     */
    @Test
    void testFilter_missing_invisible() throws Exception {
        Assertions.assertNotNull(this.vocabularies);
        Assertions.assertNotNull(this.mapping);

        final Publication p_in;
        try (InputStream json = Objects.requireNonNull(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json"))) {
            p_in = MAPPER.readValue(json, Publication.class);
        }

        Assertions.assertTrue(p_in instanceof Result);
        Assertions.assertTrue(p_in instanceof Publication);
        Assertions.assertTrue(GraphCleaningFunctions.filter(p_in));
    }

    /**
     * Runs the cleaning job (deepClean = false) on the publication table and verifies the cleaned
     * record with id {@code 50|CSC_________::2250a70c903c6ac6e4c01438259e9375}.
     *
     * <p>Checked before the run: no best access right, 14 authors, three known-invalid URLs on
     * the first instance. Checked after the run: publisher removed, 12 authors, language, country,
     * instance types, access rights, pid types, pids / alternate identifiers of the first
     * instance, titles, acceptance date, invalid URLs removed, and FOS subjects / keywords.
     */
    @Test
    void testCleaning_publication() throws Exception {
        final String id = "50|CSC_________::2250a70c903c6ac6e4c01438259e9375";
        final String byId = String.format("id = '%s'", id);

        final Publication p_in = this.read(spark, graphInputPath + "/publication", Publication.class).filter(byId).first();

        final Set<String> invalidURLs = new HashSet<>();
        invalidURLs.add("http://academia.edu/abcd");
        invalidURLs.add("http://repo.scoap3.org/api");
        invalidURLs.add("http://hdl.handle.net/");

        // State of the record before cleaning.
        Assertions.assertNull(p_in.getBestaccessright());
        Assertions.assertTrue(p_in instanceof Result);
        Assertions.assertTrue(p_in instanceof Publication);
        Assertions.assertNotNull(p_in.getAuthor());
        Assertions.assertEquals(14, p_in.getAuthor().size());
        Assertions.assertNotNull(p_in.getInstance());
        Assertions.assertNotNull(p_in.getInstance().get(0));
        Assertions.assertEquals(3L, p_in.getInstance().get(0).getUrl().stream().filter(invalidURLs::contains).count());

        new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", new String[] {
            "--inputPath", graphInputPath + "/publication",
            "--outputPath", graphOutputPath + "/publication",
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", Publication.class.getCanonicalName(),
            "--deepClean", "false",
            "--masterDuplicatePath", dsMasterDuplicatePath
        })).run(Boolean.FALSE, this.isLookUpService);

        final Publication p = this.read(spark, graphOutputPath + "/publication", Publication.class).filter(byId).first();

        // Top-level fields.
        Assertions.assertNull(p.getPublisher());
        Assertions.assertNotNull(p.getAuthor());
        Assertions.assertEquals(12, p.getAuthor().size());
        Assertions.assertEquals("und", p.getLanguage().getClassid());
        Assertions.assertEquals("Undetermined", p.getLanguage().getClassname());
        Assertions.assertEquals("DE", p.getCountry().get(0).getClassid());
        Assertions.assertEquals("Germany", p.getCountry().get(0).getClassname());

        // Instance types and access right.
        Assertions.assertEquals("0018", p.getInstance().get(0).getInstancetype().getClassid());
        Assertions.assertEquals("Annotation", p.getInstance().get(0).getInstancetype().getClassname());
        Assertions.assertEquals("0027", p.getInstance().get(1).getInstancetype().getClassid());
        Assertions.assertEquals("Model", p.getInstance().get(1).getInstancetype().getClassname());
        Assertions.assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid());
        Assertions.assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname());
        Assertions.assertEquals("CLOSED", p.getInstance().get(0).getAccessright().getClassid());
        Assertions.assertEquals("Closed Access", p.getInstance().get(0).getAccessright().getClassname());

        // Every pid qualifier of the result is a term of the pid-type vocabulary.
        final Set<String> pidTerms = this.vocabularies.getTerms("dnet:pid_types");
        Assertions.assertTrue(p.getPid().stream().map(StructuredProperty::getQualifier).allMatch(q -> pidTerms.contains(q.getClassid())));

        // Pids and alternate identifiers of the first instance.
        final List<Instance> instances = p.getInstance();
        Assertions.assertNotNull(instances);
        Assertions.assertEquals(3, instances.size());
        final Instance firstInstance = instances.get(0);
        Assertions.assertNotNull(firstInstance);
        Assertions.assertNotNull(firstInstance.getPid());
        Assertions.assertEquals(2, firstInstance.getPid().size());
        Assertions.assertTrue(firstInstance.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
        Assertions.assertTrue(firstInstance.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
        Assertions.assertNotNull(firstInstance.getAlternateIdentifier());
        Assertions.assertEquals(1, firstInstance.getAlternateIdentifier().size());
        Assertions.assertTrue(firstInstance.getAlternateIdentifier().stream().anyMatch(s -> s.getValue().equals("10.1009/qwerty")));

        // Titles.
        Assertions.assertEquals(3, p.getTitle().size());
        final List<String> titles = p.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList());
        Assertions.assertTrue(titles.contains("omic"));
        Assertions.assertTrue(titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
        Assertions.assertTrue(titles.contains("\uff62\u30de\u30ad\u30e3\u30d9\u30ea\u7684\u77e5\u6027\u3068\u5fc3\u306e\u7406\u8ad6\u306e\u9032\u5316\u8ad6\uff63 \u30ea\u30c1\u30e3\u30fc\u30c9\u30fb\u30d0\u30fc\u30f3\uff0c \u30a2\u30f3\u30c9\u30ea\u30e5\u30fc\u30fb\u30db\u30ef\u30a4\u30c8\u30a5\u30f3 \u7de8\uff0f\u85e4\u7530\u548c\u751f\uff0c \u5c71\u4e0b\u535a\u5fd7\uff0c \u53cb\u6c38\u96c5\u5df3 \u76e3\u8a33"));

        Assertions.assertEquals("CLOSED", p.getBestaccessright().getClassid());
        Assertions.assertEquals("1970-10-07", p.getDateofacceptance().getValue());

        // None of the known-invalid URLs survives on the first instance.
        Assertions.assertTrue(firstInstance.getUrl().stream().noneMatch(invalidURLs::contains));

        // FOS subjects: exactly three, one of them "0101 mathematics" with provenance "subject:fos".
        Assertions.assertNotNull(p.getSubject());
        Assertions.assertEquals(3L, p.getSubject().stream().filter(s -> "FOS".equals(s.getQualifier().getClassid())).count());
        Assertions.assertTrue(p.getSubject().stream().anyMatch(s -> "0101 mathematics".equals(s.getValue())
            && "FOS".equals(s.getQualifier().getClassid())
            && "subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));

        CleanGraphSparkJobTest.verify_keyword(p, "FOS: Mathematics");
        CleanGraphSparkJobTest.verify_keyword(p, "FOS: Computer and information sciences");
    }

    /**
     * Runs the cleaning job (deepClean = false) on the person table and verifies that the record
     * {@code 30|orcid_______::00015c0a74cb5d74682274237a57c505} keeps its id and its orcid pid.
     */
    @Test
    void testCleaning_person() throws Exception {
        final String id = "30|orcid_______::00015c0a74cb5d74682274237a57c505";

        final Person p_in = this.read(spark, graphInputPath + "/person", Person.class).filter(String.format("id = '%s'", id)).first();

        Assertions.assertTrue(p_in instanceof Person);
        Assertions.assertEquals(id, p_in.getId());
        Assertions.assertEquals("CLAUDIO", p_in.getGivenName());
        Assertions.assertEquals("CIAVATTA", p_in.getFamilyName());
        Assertions.assertNotNull(p_in.getPid());
        Assertions.assertEquals(2, p_in.getPid().size());

        final Optional<StructuredProperty> orcidIn = p_in.getPid().stream().filter(pid -> "orcid".equals(pid.getQualifier().getClassid())).findFirst();
        Assertions.assertTrue(orcidIn.isPresent());
        Assertions.assertEquals("orcid", orcidIn.get().getQualifier().getClassid());
        Assertions.assertEquals("0000-0002-7914-4394", orcidIn.get().getValue());

        new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", new String[] {
            "--inputPath", graphInputPath + "/person",
            "--outputPath", graphOutputPath + "/person",
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", Person.class.getCanonicalName(),
            "--deepClean", "false",
            "--masterDuplicatePath", dsMasterDuplicatePath
        })).run(Boolean.FALSE, this.isLookUpService);

        // NOTE(review): the output is read without an id filter; this relies on the first
        // output record being the expected person - confirm against the test data.
        final Person p = this.read(spark, graphOutputPath + "/person", Person.class).first();
        Assertions.assertEquals(id, p.getId());

        final Optional<StructuredProperty> orcidOut = p.getPid().stream().filter(pid -> "orcid".equals(pid.getQualifier().getClassid())).findFirst();
        Assertions.assertTrue(orcidOut.isPresent());
        Assertions.assertEquals("orcid", orcidOut.get().getQualifier().getClassid());
        Assertions.assertEquals("0000-0002-7914-4394", orcidOut.get().getValue());
    }

    /**
     * Expects exactly one cleaned publication with id
     * {@code 50|doi_________::b0baa0eb88a5788f0b8815560d2a32f2}.
     */
    @Test
    void testCleanDoiBoost() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
        final String publicationId = "50|doi_________::b0baa0eb88a5788f0b8815560d2a32f2";
        final int expectedOccurrences = 1;
        verifyFiltering(expectedOccurrences, publicationId);
    }

    /**
     * Expects exactly one cleaned publication with id
     * {@code 50|doi_________::4972b0ca81b96b225aed8038bb965656}.
     */
    @Test
    void testCleanDoiBoost2() throws IOException, ParseException, ISLookUpException, ClassNotFoundException {
        final String publicationId = "50|doi_________::4972b0ca81b96b225aed8038bb965656";
        final int expectedOccurrences = 1;
        verifyFiltering(expectedOccurrences, publicationId);
    }

    /**
     * Runs the cleaning job (deepClean = false) on the publication table and asserts how many
     * output publications carry the given id.
     *
     * @param expectedCount number of output publications expected to have {@code id}
     * @param id publication identifier to look for in the output
     */
    private void verifyFiltering(int expectedCount, String id) throws ISLookUpException, ClassNotFoundException, IOException, ParseException {
        final String[] jobArgs = {
            "--inputPath", graphInputPath + "/publication",
            "--outputPath", graphOutputPath + "/publication",
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", Publication.class.getCanonicalName(),
            "--deepClean", "false",
            "--masterDuplicatePath", dsMasterDuplicatePath
        };
        final CleanGraphSparkJob job = new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", jobArgs));
        job.run(Boolean.FALSE, this.isLookUpService);

        final long matching = this.read(spark, graphOutputPath + "/publication", Publication.class)
            .filter("id = '" + id + "'")
            .count();
        Assertions.assertEquals((long) expectedCount, matching);
    }

    /**
     * Runs the cleaning job with deepClean = true (contextId "sobigdata", verifyParam "gCube ")
     * on the publication table and verifies contexts and titles of the seven output publications
     * whose id ends with "_ctx".
     *
     * <p>The matching publications are collected once and then looked up by id, instead of
     * re-filtering the Spark dataset for every single assertion.
     */
    @Test
    void testCleanContext() throws Exception {
        final String prefix = "gcube ";

        new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", new String[] {
            "--inputPath", graphInputPath + "/publication",
            "--outputPath", graphOutputPath + "/publication",
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", Publication.class.getCanonicalName(),
            "--deepClean", "true",
            "--contextId", "sobigdata",
            "--verifyParam", "gCube ",
            "--masterDuplicatePath", dsMasterDuplicatePath,
            "--country", "NL",
            "--verifyCountryParam", "10.17632",
            "--collectedfrom", "NARCIS",
            "--hostedBy", Objects.requireNonNull(this.getClass().getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")).getPath()
        })).run(Boolean.FALSE, this.isLookUpService);

        final List<Publication> pubs = this.read(spark, graphOutputPath + "/publication", Publication.class)
            .filter((FilterFunction<Publication>) p -> StringUtils.endsWith(p.getId(), "_ctx"))
            .collectAsList();
        Assertions.assertEquals(7, pubs.size());

        Publication pub = findCtxPublication(pubs, "50|DansKnawCris::0224aae28af558f21768dbc6439a_ctx");
        Assertions.assertEquals(0, pub.getContext().size());

        pub = findCtxPublication(pubs, "50|DansKnawCris::20c414a3b1c742d5dd3851f1b67d_ctx");
        Assertions.assertEquals(1, pub.getContext().size());
        Assertions.assertEquals("sobigdata::projects::2", pub.getContext().get(0).getId());

        pub = findCtxPublication(pubs, "50|DansKnawCris::3c81248c335f0aa07e06817ece6f_ctx");
        Assertions.assertEquals(1, pub.getContext().size());
        Assertions.assertEquals("sobigdata::projects::2", pub.getContext().get(0).getId());
        List<StructuredProperty> titles = pub.getTitle();
        Assertions.assertEquals(1, titles.size());
        Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
        Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid());

        pub = findCtxPublication(pubs, "50|DansKnawCris::3c9f068ddc930360bec6925488a9_ctx");
        Assertions.assertEquals(1, pub.getContext().size());
        Assertions.assertEquals("sobigdata::projects::1", pub.getContext().get(0).getId());
        titles = pub.getTitle();
        Assertions.assertEquals(1, titles.size());
        Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix));
        Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()));
        Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());

        pub = findCtxPublication(pubs, "50|DansKnawCris::4669a378a73661417182c208e6fd_ctx");
        Assertions.assertEquals(1, pub.getContext().size());
        Assertions.assertEquals("dh-ch", pub.getContext().get(0).getId());
        titles = pub.getTitle();
        Assertions.assertEquals(1, titles.size());
        Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix));
        Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());

        pub = findCtxPublication(pubs, "50|DansKnawCris::4a9152e80f860eab99072e921d74_ctx");
        Assertions.assertEquals(1, pub.getContext().size());
        Assertions.assertEquals("dh-ch", pub.getContext().get(0).getId());
        titles = pub.getTitle();
        Assertions.assertEquals(2, titles.size());
        Assertions.assertTrue(titles.stream().anyMatch(t -> t.getQualifier().getClassid().equals("main title") && t.getValue().toLowerCase().startsWith(prefix)));

        pub = findCtxPublication(pubs, "50|dedup_wf_001::01e6a28565ca01376b7548e530c6_ctx");
        Assertions.assertEquals(1, pub.getContext().size());
        Assertions.assertEquals("dh-ch", pub.getContext().get(0).getId());
        titles = pub.getTitle();
        Assertions.assertEquals(2, titles.size());
        Assertions.assertTrue(titles.stream().anyMatch(t -> t.getQualifier().getClassid().equals("main title") && t.getValue().toLowerCase().startsWith(prefix)));
    }

    /**
     * Returns the first publication of {@code pubs} whose id equals {@code id}; fails the test
     * with an {@link AssertionError} naming the id when there is none.
     */
    private static Publication findCtxPublication(List<Publication> pubs, String id) {
        return pubs
            .stream()
            .filter(p -> id.equals(p.getId()))
            .findFirst()
            .orElseThrow(() -> new AssertionError("no cleaned publication found with id " + id));
    }

    /**
     * Runs the cleaning job with deepClean = true on the "orp" table and expects exactly one
     * record in the output.
     */
    @Test
    void testClean_ORP() throws Exception {
        new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", new String[] {
            "--inputPath", graphInputPath + "/orp",
            "--outputPath", graphOutputPath + "/orp",
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", OtherResearchProduct.class.getCanonicalName(),
            "--deepClean", "true",
            "--contextId", "sobigdata",
            "--verifyParam", "gCube ",
            "--masterDuplicatePath", dsMasterDuplicatePath,
            "--country", "NL",
            "--verifyCountryParam", "10.17632",
            "--collectedfrom", "NARCIS",
            "--hostedBy", Objects.requireNonNull(this.getClass().getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")).getPath()
        })).run(Boolean.FALSE, this.isLookUpService);

        final Dataset<OtherResearchProduct> orp = this.read(spark, graphOutputPath + "/orp", OtherResearchProduct.class);
        Assertions.assertEquals(1L, orp.count());
    }

    @Test
    void testCleanCfHbSparkJob() throws Exception {
        Dataset<Publication> pubs_in = this.read(spark, graphInputPath + "/publication", Publication.class);
        Publication p1_in = (Publication)pubs_in.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'").first();
        Assertions.assertEquals((Object)"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", (Object)((KeyValue)p1_in.getCollectedfrom().get(0)).getKey());
        Assertions.assertEquals((Object)"Bacterial Protein Interaction Database - DUP", (Object)((KeyValue)p1_in.getCollectedfrom().get(0)).getValue());
        Assertions.assertEquals((Object)"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", (Object)((Instance)p1_in.getInstance().get(0)).getCollectedfrom().getKey());
        Assertions.assertEquals((Object)"Bacterial Protein Interaction Database - DUP", (Object)((Instance)p1_in.getInstance().get(0)).getCollectedfrom().getValue());
        Publication p2_in = (Publication)pubs_in.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'").first();
        Assertions.assertEquals((Object)"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", (Object)((KeyValue)p2_in.getCollectedfrom().get(0)).getKey());
        Assertions.assertEquals((Object)"FILUR DATA - DUP", (Object)((KeyValue)p2_in.getCollectedfrom().get(0)).getValue());
        Assertions.assertEquals((Object)"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", (Object)((Instance)p2_in.getInstance().get(0)).getCollectedfrom().getKey());
        Assertions.assertEquals((Object)"FILUR DATA - DUP", (Object)((Instance)p2_in.getInstance().get(0)).getCollectedfrom().getValue());
        Assertions.assertEquals((Object)"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", (Object)((Instance)p2_in.getInstance().get(0)).getHostedby().getKey());
        Assertions.assertEquals((Object)"depositar - DUP", (Object)((Instance)p2_in.getInstance().get(0)).getHostedby().getValue());
        Publication p3_in = (Publication)pubs_in.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'").first();
        Assertions.assertEquals((Object)"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", (Object)((KeyValue)p3_in.getCollectedfrom().get(0)).getKey());
        Assertions.assertEquals((Object)"DANS (Data Archiving and Networked Services)", (Object)((KeyValue)p3_in.getCollectedfrom().get(0)).getValue());
        Assertions.assertEquals((Object)"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", (Object)((Instance)p3_in.getInstance().get(0)).getCollectedfrom().getKey());
        Assertions.assertEquals((Object)"DANS (Data Archiving and Networked Services)", (Object)((Instance)p3_in.getInstance().get(0)).getCollectedfrom().getValue());
        Assertions.assertEquals((Object)"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", (Object)((Instance)p3_in.getInstance().get(0)).getHostedby().getKey());
        Assertions.assertEquals((Object)"DANS (Data Archiving and Networked Services)", (Object)((Instance)p3_in.getInstance().get(0)).getHostedby().getValue());
        new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", new String[]{"--inputPath", graphInputPath + "/publication", "--outputPath", graphOutputPath + "/publication", "--isLookupUrl", "lookupurl", "--graphTableClassName", Publication.class.getCanonicalName(), "--deepClean", "true", "--contextId", "sobigdata", "--verifyParam", "gCube ", "--masterDuplicatePath", dsMasterDuplicatePath, "--country", "NL", "--verifyCountryParam", "10.17632", "--collectedfrom", "NARCIS", "--hostedBy", Objects.requireNonNull(this.getClass().getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")).getPath()})).run(Boolean.valueOf(false), this.isLookUpService);
        Assertions.assertTrue((boolean)Files.exists(Paths.get(graphOutputPath, "publication"), new LinkOption[0]));
        Dataset pubs_out = this.read(spark, graphOutputPath + "/publication", Publication.class).filter((FilterFunction & Serializable)p -> StringUtils.endsWith((CharSequence)p.getId(), (CharSequence)"_cfhb"));
        Assertions.assertEquals((long)3L, (long)pubs_out.count());
        Publication p1_out = (Publication)pubs_out.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13b_cfhb'").first();
        Assertions.assertEquals((Object)"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", (Object)((KeyValue)p1_out.getCollectedfrom().get(0)).getKey());
        Assertions.assertEquals((Object)"Bacterial Protein Interaction Database", (Object)((KeyValue)p1_out.getCollectedfrom().get(0)).getValue());
        Assertions.assertEquals((Object)"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", (Object)((Instance)p1_out.getInstance().get(0)).getCollectedfrom().getKey());
        Assertions.assertEquals((Object)"Bacterial Protein Interaction Database", (Object)((Instance)p1_out.getInstance().get(0)).getCollectedfrom().getValue());
        Publication p2_out = (Publication)pubs_out.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3a_cfhb'").first();
        Assertions.assertEquals((Object)"10|re3data_____::fc1db64b3964826913b1e9eafe830490", (Object)((KeyValue)p2_out.getCollectedfrom().get(0)).getKey());
        Assertions.assertEquals((Object)"FULIR Data", (Object)((KeyValue)p2_out.getCollectedfrom().get(0)).getValue());
        Assertions.assertEquals((Object)"10|re3data_____::fc1db64b3964826913b1e9eafe830490", (Object)((Instance)p2_out.getInstance().get(0)).getCollectedfrom().getKey());
        Assertions.assertEquals((Object)"FULIR Data", (Object)((Instance)p2_out.getInstance().get(0)).getCollectedfrom().getValue());
        Assertions.assertEquals((Object)"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", (Object)((Instance)p2_out.getInstance().get(0)).getHostedby().getKey());
        Assertions.assertEquals((Object)"depositar", (Object)((Instance)p2_out.getInstance().get(0)).getHostedby().getValue());
        Publication p3_out = (Publication)pubs_out.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7d_cfhb'").first();
        Assertions.assertEquals((Object)"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", (Object)((KeyValue)p3_out.getCollectedfrom().get(0)).getKey());
        Assertions.assertEquals((Object)"DANS (Data Archiving and Networked Services)", (Object)((KeyValue)p3_out.getCollectedfrom().get(0)).getValue());
        Assertions.assertEquals((Object)"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", (Object)((Instance)p3_out.getInstance().get(0)).getCollectedfrom().getKey());
        Assertions.assertEquals((Object)"DANS (Data Archiving and Networked Services)", (Object)((Instance)p3_out.getInstance().get(0)).getCollectedfrom().getValue());
        Assertions.assertEquals((Object)"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", (Object)((Instance)p3_out.getInstance().get(0)).getHostedby().getKey());
        Assertions.assertEquals((Object)"DANS (Data Archiving and Networked Services)", (Object)((Instance)p3_out.getInstance().get(0)).getHostedby().getValue());
    }

    /**
     * Runs {@link CleanGraphSparkJob} over the publication input and checks the
     * {@code country} lists of the records whose id ends with {@code _country}.
     *
     * <p>Expectations pinned by this test: eight such publications are written;
     * three of the listed ids carry exactly one country and one carries none.
     *
     * @throws Exception if the job or any of the reads fails
     */
    @Test
    void testCleanCountry() throws Exception {
        final String hostedByPath = Objects
            .requireNonNull(this.getClass().getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy"))
            .getPath();
        final String[] jobArgs = {
            "--inputPath", graphInputPath + "/publication",
            "--outputPath", graphOutputPath + "/publication",
            "--isLookupUrl", "lookupurl",
            "--graphTableClassName", Publication.class.getCanonicalName(),
            "--deepClean", "true",
            "--contextId", "sobigdata",
            "--verifyParam", "gCube ",
            "--masterDuplicatePath", dsMasterDuplicatePath,
            "--country", "NL",
            "--verifyCountryParam", "10.17632",
            "--collectedfrom", "NARCIS",
            "--hostedBy", hostedByPath
        };
        new CleanGraphSparkJob(this.args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", jobArgs))
            .run(Boolean.FALSE, this.isLookUpService);

        // The type arguments on Dataset and on the FilterFunction cast are required:
        // with raw types the lambda parameter is an Object and getId() is not visible.
        final Dataset<Publication> pubs_out = this
            .read(spark, graphOutputPath + "/publication", Publication.class)
            .filter((FilterFunction<Publication> & Serializable) p -> StringUtils.endsWith(p.getId(), "_country"));
        Assertions.assertEquals(8L, pubs_out.count());

        final String[] idsWithOneCountry = {
            "50|DansKnawCris::0224aae28af558f21768dbc6_country",
            "50|DansKnawCris::20c414a3b1c742d5dd3851f1_country",
            "50|DansKnawCris::3c81248c335f0aa07e06817e_country"
        };
        for (final String id : idsWithOneCountry) {
            final Publication pub = pubs_out
                .filter((FilterFunction<Publication> & Serializable) p -> p.getId().equals(id))
                .first();
            Assertions.assertEquals(1, pub.getCountry().size(), "country count for " + id);
        }

        final String idWithoutCountry = "50|DansKnawCris::3c81248c335f0aa07e06817d_country";
        final Publication withoutCountry = pubs_out
            .filter((FilterFunction<Publication> & Serializable) p -> p.getId().equals(idWithoutCountry))
            .first();
        Assertions.assertEquals(0, withoutCountry.getCountry().size(), "country count for " + idWithoutCountry);
    }

    /**
     * Reads the classpath resource {@code /eu/dnetlib/dhp/oa/graph/clean/terms.txt}
     * and returns its content split into lines.
     *
     * @return the lines of the resource, in file order
     * @throws IOException if the resource cannot be read
     * @throws NullPointerException if the resource is not on the classpath
     */
    private List<String> vocs() throws IOException {
        final String resource = "/eu/dnetlib/dhp/oa/graph/clean/terms.txt";
        // try-with-resources: the stream returned by getResourceAsStream must be closed by the caller.
        try (InputStream in = Objects.requireNonNull(
                this.getClass().getResourceAsStream(resource),
                "classpath resource not found: " + resource)) {
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the resource is stored as UTF-8 - confirm.
            return IOUtils.readLines(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }

    /**
     * Reads the classpath resource {@code /eu/dnetlib/dhp/oa/graph/clean/synonyms.txt}
     * and returns its content split into lines.
     *
     * @return the lines of the resource, in file order
     * @throws IOException if the resource cannot be read
     * @throws NullPointerException if the resource is not on the classpath
     */
    private List<String> synonyms() throws IOException {
        final String resource = "/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt";
        // try-with-resources: the stream returned by getResourceAsStream must be closed by the caller.
        try (InputStream in = Objects.requireNonNull(
                this.getClass().getResourceAsStream(resource),
                "classpath resource not found: " + resource)) {
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the resource is stored as UTF-8 - confirm.
            return IOUtils.readLines(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }

    /**
     * Loads the text file(s) at {@code path} and converts every line into an
     * instance of {@code clazz}, using a bean encoder for the resulting dataset.
     *
     * @param spark the session used to read the text
     * @param path  location of the text input
     * @param clazz target bean type of each line
     * @return a dataset with one {@code R} per input line
     */
    private <R> Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
        final Dataset<String> lines = spark.read().textFile(path);
        final MapFunction<String, R> lineParser = CleanGraphSparkJobTest.as(clazz);
        return lines.map(lineParser, Encoders.bean(clazz));
    }

    /**
     * Builds a serializable map function that turns a string into an instance of
     * {@code clazz} by delegating to {@code MAPPER.readValue}.
     *
     * @param clazz the type each input string is parsed into
     * @return a function suitable for {@code Dataset.map}
     */
    private static <R> MapFunction<String, R> as(Class<R> clazz) {
        // The cast must carry the type arguments: with a raw MapFunction the lambda
        // parameter is an Object and no readValue overload accepts it.
        return (MapFunction<String, R> & Serializable) json -> MAPPER.readValue(json, clazz);
    }

    /**
     * Loads a classpath resource, resolved relative to this test class, as a single string.
     *
     * @param path classpath location of the resource
     * @return the full text content of the resource
     * @throws IOException if the resource cannot be read
     * @throws NullPointerException if no resource exists at {@code path}
     */
    private static String classPathResourceAsString(String path) throws IOException {
        // try-with-resources: the stream returned by getResourceAsStream must be closed by the caller.
        try (InputStream in = Objects.requireNonNull(
                CleanGraphSparkJobTest.class.getResourceAsStream(path),
                "classpath resource not found: " + path)) {
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the resource is stored as UTF-8 - confirm.
            return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
        }
    }

    /**
     * Creates an argument parser from the parameter specification stored at
     * {@code paramSpecs} on the classpath and feeds it the given command line.
     *
     * @param paramSpecs classpath location of the parameter specification
     * @param args       command-line style arguments to parse
     * @return the parser after {@code parseArgument} has been applied
     * @throws IOException    if the specification cannot be read
     * @throws ParseException if the arguments cannot be parsed
     */
    private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
        final String specification = CleanGraphSparkJobTest.classPathResourceAsString(paramSpecs);
        final ArgumentApplicationParser configured = new ArgumentApplicationParser(specification);
        configured.parseArgument(args);
        return configured;
    }

    /**
     * Asserts that the publication has a subject whose value equals {@code subject}
     * and that the first such subject's qualifier has class id and class name
     * {@code "keyword"}.
     *
     * @param p_cleaned the publication to inspect
     * @param subject   the subject value to look for
     */
    private static void verify_keyword(Publication p_cleaned, String subject) {
        // Find the first subject with the requested value, stopping at the first hit.
        Optional<Subject> match = Optional.empty();
        for (Subject candidate : p_cleaned.getSubject()) {
            if (candidate.getValue().equals(subject)) {
                match = Optional.of(candidate);
                break;
            }
        }
        Assertions.assertTrue(match.isPresent());
        final Subject found = match.get();
        Assertions.assertEquals("keyword", found.getQualifier().getClassid());
        Assertions.assertEquals("keyword", found.getQualifier().getClassname());
    }
}

