refactoring

2020-04-27 10:57:50 +02:00 · 2020-04-27 10:57:50 +02:00 · 6135096ef1
parent d30e710165
commit 6135096ef1
3 changed files with 517 additions and 260 deletions
--- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java
+++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java
@@ -1,10 +1,13 @@
 package eu.dnetlib.dhp.countrypropagation;
-import com.google.gson.Gson;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.Country;
 import eu.dnetlib.dhp.schema.oaf.Datasource;
 import eu.dnetlib.dhp.schema.oaf.Software;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
@@ -16,18 +19,8 @@ import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import scala.Array;
 import scala.Tuple2;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 public class CountryPropagationJobTest {
    private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class);
@@ -54,13 +47,11 @@ public class CountryPropagationJobTest {
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
-        spark = SparkSession
+        spark =
-                .builder()
+                SparkSession.builder()
-                .appName(CountryPropagationJobTest.class.getSimpleName())
+                        .appName(CountryPropagationJobTest.class.getSimpleName())
-                .config(conf)
+                        .config(conf)
-                .getOrCreate();
+                        .getOrCreate();
    }
    @AfterAll
@@ -69,95 +60,190 @@ public class CountryPropagationJobTest {
        spark.stop();
    }
-   @Test
+    @Test
    public void testCountryPropagationSoftware() throws Exception {
-        SparkCountryPropagationJob2.main(new String[]{
+        SparkCountryPropagationJob2.main(
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                new String[] {
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/countrypropagation/sample/software").getPath(),
+                    "-isSparkSessionManaged",
-                "-hive_metastore_uris", "",
+                    Boolean.FALSE.toString(),
-            "-writeUpdate","false",
+                    "-sourcePath",
-            "-saveGraph","true",
+                    getClass()
-            "-resultTableName","eu.dnetlib.dhp.schema.oaf.Software",
+                            .getResource("/eu/dnetlib/dhp/countrypropagation/sample/software")
-            "-outputPath",workingDir.toString() + "/software",
+                            .getPath(),
-            "-preparedInfoPath", getClass().getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo").getPath(),
+                    "-hive_metastore_uris",
-        });
+                    "",
                    "-saveGraph",
                    "true",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Software",
                    "-outputPath",
                    workingDir.toString() + "/software",
                    "-preparedInfoPath",
                    getClass()
                            .getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo")
                            .getPath(),
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Software> tmp = sc.textFile(workingDir.toString()+"/software")
+        JavaRDD<Software> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Software.class));
+                sc.textFile(workingDir.toString() + "/software")
                        .map(item -> OBJECT_MAPPER.readValue(item, Software.class));
-        //tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
+        // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
        Assertions.assertEquals(10, tmp.count());
-       Dataset<Software> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Software.class));
+        Dataset<Software> verificationDs =
                spark.createDataset(tmp.rdd(), Encoders.bean(Software.class));
-       Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count());
+        Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count());
-       Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count());
+        Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count());
-       Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count());
+        Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count());
-       Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count());
+        Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count());
        Dataset<String> countryExploded =
                verificationDs
                        .flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class))
                        .map(c -> c.getClassid(), Encoders.STRING());
-       Dataset<String> countryExploded = verificationDs
+        Assertions.assertEquals(9, countryExploded.count());
               .flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class))
               .map(c -> c.getClassid(), Encoders.STRING());
-       Assertions.assertEquals(9, countryExploded.count());
+        Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count());
        Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count());
        Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count());
        Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count());
        Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count());
        Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count());
        Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count());
-       Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count());
+        Dataset<Tuple2<String, String>> countryExplodedWithCountryclassid =
-       Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count());
+                verificationDs.flatMap(
-       Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count());
+                        row -> {
-       Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count());
+                            List<Tuple2<String, String>> prova = new ArrayList();
-       Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count());
+                            List<Country> country_list = row.getCountry();
-       Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count());
+                            country_list.stream()
-       Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count());
+                                    .forEach(
                                            c ->
                                                    prova.add(
                                                            new Tuple2<>(
                                                                    row.getId(), c.getClassid())));
                            return prova.iterator();
                        },
                        Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
-       Dataset<Tuple2<String, String>> countryExplodedWithCountryclassid = verificationDs
+        Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
               .flatMap(row -> {
                   List<Tuple2<String, String>> prova = new ArrayList();
                   List<Country> country_list = row.getCountry();
                   country_list.stream().forEach(c -> prova.add(new Tuple2<>(row.getId(), c.getClassid())));
                   return prova.iterator();
               }, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
-       Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
+        countryExplodedWithCountryclassid.show(false);
        Assertions.assertEquals(
                1,
                countryExplodedWithCountryclassid
                        .filter(
                                "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ")
                        .count());
        Assertions.assertEquals(
                1,
                countryExplodedWithCountryclassid
                        .filter(
                                "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ")
                        .count());
        Assertions.assertEquals(
                2,
                countryExplodedWithCountryclassid
                        .filter(
                                "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ")
                        .count());
        Assertions.assertEquals(
                2,
                countryExplodedWithCountryclassid
                        .filter(
                                "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ")
                        .count());
        Assertions.assertEquals(
                1,
                countryExplodedWithCountryclassid
                        .filter(
                                "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'")
                        .count());
        Assertions.assertEquals(
                2,
                countryExplodedWithCountryclassid
                        .filter(
                                "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ")
                        .count());
-       countryExplodedWithCountryclassid.show(false);
+        Dataset<Tuple2<String, String>> countryExplodedWithCountryclassname =
-       Assertions.assertEquals(1, countryExplodedWithCountryclassid.filter("_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ").count());
+                verificationDs.flatMap(
-       Assertions.assertEquals(1, countryExplodedWithCountryclassid.filter("_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ").count());
+                        row -> {
-       Assertions.assertEquals(2, countryExplodedWithCountryclassid.filter("_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ").count());
+                            List<Tuple2<String, String>> prova = new ArrayList();
-       Assertions.assertEquals(2, countryExplodedWithCountryclassid.filter("_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ").count());
+                            List<Country> country_list = row.getCountry();
-       Assertions.assertEquals(1, countryExplodedWithCountryclassid.filter("_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'").count());
+                            country_list.stream()
-       Assertions.assertEquals(2, countryExplodedWithCountryclassid.filter("_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ").count());
+                                    .forEach(
                                            c ->
                                                    prova.add(
                                                            new Tuple2<>(
                                                                    row.getId(),
                                                                    c.getClassname())));
                            return prova.iterator();
                        },
                        Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
-       Dataset<Tuple2<String, String>> countryExplodedWithCountryclassname = verificationDs
+        countryExplodedWithCountryclassname.show(false);
-               .flatMap(row -> {
+        Assertions.assertEquals(
-                   List<Tuple2<String, String>> prova = new ArrayList();
+                1,
-                   List<Country> country_list = row.getCountry();
+                countryExplodedWithCountryclassname
-                   country_list.stream().forEach(c -> prova.add(new Tuple2<>(row.getId(), c.getClassname())));
+                        .filter(
-                   return prova.iterator();
+                                "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ")
-               }, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
+                        .count());
        Assertions.assertEquals(
                1,
                countryExplodedWithCountryclassname
                        .filter(
                                "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ")
                        .count());
        Assertions.assertEquals(
                2,
                countryExplodedWithCountryclassname
                        .filter(
                                "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'Italy' or _2 = 'Mexico') ")
                        .count());
        Assertions.assertEquals(
                2,
                countryExplodedWithCountryclassname
                        .filter(
                                "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ")
                        .count());
        Assertions.assertEquals(
                1,
                countryExplodedWithCountryclassname
                        .filter(
                                "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ")
                        .count());
        Assertions.assertEquals(
                2,
                countryExplodedWithCountryclassname
                        .filter(
                                "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ")
                        .count());
-       countryExplodedWithCountryclassname.show(false);
+        Dataset<Tuple2<String, String>> countryExplodedWithCountryProvenance =
-       Assertions.assertEquals(1, countryExplodedWithCountryclassname.filter("_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ").count());
+                verificationDs.flatMap(
-       Assertions.assertEquals(1, countryExplodedWithCountryclassname.filter("_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ").count());
+                        row -> {
-       Assertions.assertEquals(2, countryExplodedWithCountryclassname.filter("_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'Italy' or _2 = 'Mexico') ").count());
+                            List<Tuple2<String, String>> prova = new ArrayList();
-       Assertions.assertEquals(2, countryExplodedWithCountryclassname.filter("_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ").count());
+                            List<Country> country_list = row.getCountry();
-       Assertions.assertEquals(1, countryExplodedWithCountryclassname.filter("_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ").count());
+                            country_list.stream()
-       Assertions.assertEquals(2, countryExplodedWithCountryclassname.filter("_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ").count());
+                                    .forEach(
-
+                                            c ->
-       Dataset<Tuple2<String, String>> countryExplodedWithCountryProvenance = verificationDs
+                                                    prova.add(
-               .flatMap(row -> {
+                                                            new Tuple2<>(
-                   List<Tuple2<String, String>> prova = new ArrayList();
+                                                                    row.getId(),
-                   List<Country> country_list = row.getCountry();
+                                                                    c.getDataInfo()
-                   country_list.stream().forEach(c -> prova.add(new Tuple2<>(row.getId(), c.getDataInfo().getInferenceprovenance())));
+                                                                            .getInferenceprovenance())));
-                   return prova.iterator();
+                            return prova.iterator();
-               }, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
+                        },
-
+                        Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
       Assertions.assertEquals(7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count());
   }
        Assertions.assertEquals(
                7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count());
    }
 }
--- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java
+++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java
@@ -1,8 +1,10 @@
 package eu.dnetlib.dhp.orcidtoresultfromsemrel;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.Result2OrganizationJobTest;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
@@ -17,23 +19,18 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 public class OrcidPropagationJobTest {
-    private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class);
+    private static final Logger log = LoggerFactory.getLogger(OrcidPropagationJobTest.class);
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-    private static final ClassLoader cl = Result2OrganizationJobTest.class.getClassLoader();
+    private static final ClassLoader cl = OrcidPropagationJobTest.class.getClassLoader();
    private static SparkSession spark;
    private static Path workingDir;
    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files.createTempDirectory(OrcidPropagationJobTest.class.getSimpleName());
@@ -49,13 +46,11 @@ public class OrcidPropagationJobTest {
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
-
+        spark =
-        spark = SparkSession
+                SparkSession.builder()
-                .builder()
+                        .appName(OrcidPropagationJobTest.class.getSimpleName())
-                .appName(OrcidPropagationJobTest.class.getSimpleName())
+                        .config(conf)
-                .config(conf)
+                        .getOrCreate();
                .getOrCreate();
    }
    @AfterAll
@@ -64,132 +59,194 @@ public class OrcidPropagationJobTest {
        spark.stop();
    }
    @Test
-    public void noUpdateTest()throws Exception{
+    public void noUpdateTest() throws Exception {
-        SparkOrcidToResultFromSemRelJob3.main(new String[]{
+        SparkOrcidToResultFromSemRelJob3.main(
-                "-isTest", Boolean.TRUE.toString(),
+                new String[] {
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-isTest",
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate").getPath(),
+                    Boolean.TRUE.toString(),
-                "-hive_metastore_uris", "",
+                    "-isSparkSessionManaged",
-                "-saveGraph","true",
+                    Boolean.FALSE.toString(),
-                "-resultTableName","eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-sourcePath",
-                "-outputPath",workingDir.toString() + "/dataset",
+                    getClass()
-                "-possibleUpdatesPath", getClass().getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc").getPath()
+                            .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate")
-        });
+                            .getPath(),
                    "-hive_metastore_uris",
                    "",
                    "-saveGraph",
                    "true",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Dataset",
                    "-outputPath",
                    workingDir.toString() + "/dataset",
                    "-possibleUpdatesPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc")
                            .getPath()
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Dataset> tmp = sc.textFile(workingDir.toString()+"/dataset")
+        JavaRDD<Dataset> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+                sc.textFile(workingDir.toString() + "/dataset")
                        .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
-        //tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
+        // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
        Assertions.assertEquals(10, tmp.count());
-        org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+        org.apache.spark.sql.Dataset<Dataset> verificationDataset =
                spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
        verificationDataset.createOrReplaceTempView("dataset");
-        String query = "select id " +
+        String query =
-                "from dataset " +
+                "select id "
-                "lateral view explode(author) a as MyT " +
+                        + "from dataset "
-                "lateral view explode(MyT.pid) p as MyP " +
+                        + "lateral view explode(author) a as MyT "
-                "where MyP.datainfo.inferenceprovenance = 'propagation'";
+                        + "lateral view explode(MyT.pid) p as MyP "
                        + "where MyP.datainfo.inferenceprovenance = 'propagation'";
        Assertions.assertEquals(0, spark.sql(query).count());
    }
    @Test
-    public void oneUpdateTest() throws Exception{
+    public void oneUpdateTest() throws Exception {
-        SparkOrcidToResultFromSemRelJob3.main(new String[]{
+        SparkOrcidToResultFromSemRelJob3.main(
-                "-isTest", Boolean.TRUE.toString(),
+                new String[] {
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-isTest",
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate").getPath(),
+                    Boolean.TRUE.toString(),
-                "-hive_metastore_uris", "",
+                    "-isSparkSessionManaged",
-                "-saveGraph","true",
+                    Boolean.FALSE.toString(),
-                "-resultTableName","eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-sourcePath",
-                "-outputPath",workingDir.toString() + "/dataset",
+                    getClass()
-                "-possibleUpdatesPath", getClass().getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc").getPath()
+                            .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate")
-        });
+                            .getPath(),
                    "-hive_metastore_uris",
                    "",
                    "-saveGraph",
                    "true",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Dataset",
                    "-outputPath",
                    workingDir.toString() + "/dataset",
                    "-possibleUpdatesPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc")
                            .getPath()
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Dataset> tmp = sc.textFile(workingDir.toString()+"/dataset")
+        JavaRDD<Dataset> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+                sc.textFile(workingDir.toString() + "/dataset")
                        .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
-        //tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
+        // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s));
        Assertions.assertEquals(10, tmp.count());
-        org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+        org.apache.spark.sql.Dataset<Dataset> verificationDataset =
                spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
        verificationDataset.createOrReplaceTempView("dataset");
-        String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " +
+        String query =
-                "from dataset " +
+                "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType "
-                "lateral view explode(author) a as MyT " +
+                        + "from dataset "
-                "lateral view explode(MyT.pid) p as MyP " +
+                        + "lateral view explode(author) a as MyT "
-                "where MyP.datainfo.inferenceprovenance = 'propagation'";
+                        + "lateral view explode(MyT.pid) p as MyP "
                        + "where MyP.datainfo.inferenceprovenance = 'propagation'";
        org.apache.spark.sql.Dataset<Row> propagatedAuthors = spark.sql(query);
        Assertions.assertEquals(1, propagatedAuthors.count());
-        Assertions.assertEquals(1, propagatedAuthors.filter("id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' " +
+        Assertions.assertEquals(
-                "and name = 'Vajinder' and surname = 'Kumar' and pidType = 'ORCID'").count());
+                1,
                propagatedAuthors
                        .filter(
                                "id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' "
                                        + "and name = 'Vajinder' and surname = 'Kumar' and pidType = 'ORCID'")
                        .count());
        Assertions.assertEquals(1, propagatedAuthors.filter("pid = '0000-0002-8825-3517'").count());
    }
    @Test
-    public void twoUpdatesTest() throws Exception{
+    public void twoUpdatesTest() throws Exception {
-        SparkOrcidToResultFromSemRelJob3.main(new String[]{
+        SparkOrcidToResultFromSemRelJob3.main(
-                "-isTest", Boolean.TRUE.toString(),
+                new String[] {
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-isTest",
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates").getPath(),
+                    Boolean.TRUE.toString(),
-                "-hive_metastore_uris", "",
+                    "-isSparkSessionManaged",
-                "-saveGraph","true",
+                    Boolean.FALSE.toString(),
-                "-resultTableName","eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-sourcePath",
-                "-outputPath",workingDir.toString() + "/dataset",
+                    getClass()
-                "-possibleUpdatesPath", getClass().getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc").getPath()
+                            .getResource(
-        });
+                                    "/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates")
                            .getPath(),
                    "-hive_metastore_uris",
                    "",
                    "-saveGraph",
                    "true",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Dataset",
                    "-outputPath",
                    workingDir.toString() + "/dataset",
                    "-possibleUpdatesPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc")
                            .getPath()
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Dataset> tmp = sc.textFile(workingDir.toString()+"/dataset")
+        JavaRDD<Dataset> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+                sc.textFile(workingDir.toString() + "/dataset")
                        .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
        Assertions.assertEquals(10, tmp.count());
-        org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
+        org.apache.spark.sql.Dataset<Dataset> verificationDataset =
                spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
        verificationDataset.createOrReplaceTempView("dataset");
-        String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " +
+        String query =
-                "from dataset " +
+                "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType "
-                "lateral view explode(author) a as MyT " +
+                        + "from dataset "
-                "lateral view explode(MyT.pid) p as MyP " +
+                        + "lateral view explode(author) a as MyT "
-                "where MyP.datainfo.inferenceprovenance = 'propagation'";
+                        + "lateral view explode(MyT.pid) p as MyP "
                        + "where MyP.datainfo.inferenceprovenance = 'propagation'";
        org.apache.spark.sql.Dataset<Row> propagatedAuthors = spark.sql(query);
        Assertions.assertEquals(2, propagatedAuthors.count());
-        Assertions.assertEquals(1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
+        Assertions.assertEquals(
-        Assertions.assertEquals(1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count());
+                1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
        Assertions.assertEquals(
                1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count());
-        query = "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType " +
+        query =
-                "from dataset " +
+                "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType "
-                "lateral view explode(author) a as MyT " +
+                        + "from dataset "
-                "lateral view explode(MyT.pid) p as MyP ";
+                        + "lateral view explode(author) a as MyT "
                        + "lateral view explode(MyT.pid) p as MyP ";
        org.apache.spark.sql.Dataset<Row> authorsExplodedPids = spark.sql(query);
-        Assertions.assertEquals(2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
+        Assertions.assertEquals(
-        Assertions.assertEquals(1, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'").count());
+                2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
-
+        Assertions.assertEquals(
                1,
                authorsExplodedPids
                        .filter(
                                "name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'")
                        .count());
    }
 }
--- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java
+++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java
@ -2,6 +2,9 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
@ -16,10 +19,6 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 public class Result2OrganizationJobTest {
    private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class);
@ -32,10 +31,11 @@ public class Result2OrganizationJobTest {
    private static Path workingDir;
    @BeforeAll
    public static void beforeAll() throws IOException {
-        workingDir = Files.createTempDirectory(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName());
+        workingDir =
                Files.createTempDirectory(
                        SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName());
        log.info("using work dir {}", workingDir);
        SparkConf conf = new SparkConf();
@ -48,13 +48,11 @@ public class Result2OrganizationJobTest {
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
-
+        spark =
-        spark = SparkSession
+                SparkSession.builder()
-                .builder()
+                        .appName(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName())
-                .appName(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName())
+                        .config(conf)
-                .config(conf)
+                        .getOrCreate();
                .getOrCreate();
    }
    @AfterAll
@ -65,109 +63,225 @@ public class Result2OrganizationJobTest {
    /**
     * No modifications done to the sample sets, so that no possible updates are created
     *
     * @throws Exception
     */
    @Test
    public void NoUpdateTest() throws Exception {
-        SparkResultToOrganizationFromIstRepoJob2.main(new String[]{
+        SparkResultToOrganizationFromIstRepoJob2.main(
-                "-isTest", Boolean.TRUE.toString(),
+                new String[] {
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-isTest",
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix").getPath(),
+                    Boolean.TRUE.toString(),
-                "-hive_metastore_uris", "",
+                    "-isSparkSessionManaged",
-                "-resultTableName","eu.dnetlib.dhp.schema.oaf.Software",
+                    Boolean.FALSE.toString(),
-                "-writeUpdate", "false",
+                    "-sourcePath",
-                "-saveGraph", "true",
+                    getClass()
-                "-outputPath", workingDir.toString() + "/relation",
+                            .getResource(
-                "-datasourceOrganizationPath", getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization").getPath(),
+                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
-                "-alreadyLinkedPath",getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked").getPath(),
+                            .getPath(),
-        });
+                    "-hive_metastore_uris",
                    "",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Software",
                    "-writeUpdate",
                    "false",
                    "-saveGraph",
                    "true",
                    "-outputPath",
                    workingDir.toString() + "/relation",
                    "-datasourceOrganizationPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization")
                            .getPath(),
                    "-alreadyLinkedPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked")
                            .getPath(),
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Relation> tmp = sc.textFile(workingDir.toString()+"/relation")
+        JavaRDD<Relation> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+                sc.textFile(workingDir.toString() + "/relation")
                        .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
        Assertions.assertEquals(0, tmp.count());
    }
    /**
-     * Testing set with modified association between datasource and organization. Copied some hostedby collectedfrom
+     * Testing set with modified association between datasource and organization. Copied some
-     * from the software sample set. No intersection with the already linked (all the possible new relations will become
+     * hostedby collectedfrom from the software sample set. No intersection with the already linked
-     * new relations)
+     * (all the possible new relations will become new relations)
     *
     * @throws Exception
     */
    @Test
    public void UpdateNoMixTest() throws Exception {
-        SparkResultToOrganizationFromIstRepoJob2.main(new String[]{
+        SparkResultToOrganizationFromIstRepoJob2.main(
-                "-isTest", Boolean.TRUE.toString(),
+                new String[] {
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-isTest",
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix").getPath(),
+                    Boolean.TRUE.toString(),
-                "-hive_metastore_uris", "",
+                    "-isSparkSessionManaged",
-                "-resultTableName","eu.dnetlib.dhp.schema.oaf.Software",
+                    Boolean.FALSE.toString(),
-                "-writeUpdate", "false",
+                    "-sourcePath",
-                "-saveGraph", "true",
+                    getClass()
-                "-outputPath", workingDir.toString() + "/relation",
+                            .getResource(
-                "-datasourceOrganizationPath", getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization").getPath(),
+                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
-                "-alreadyLinkedPath",getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked").getPath(),
+                            .getPath(),
-        });
+                    "-hive_metastore_uris",
                    "",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Software",
                    "-writeUpdate",
                    "false",
                    "-saveGraph",
                    "true",
                    "-outputPath",
                    workingDir.toString() + "/relation",
                    "-datasourceOrganizationPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization")
                            .getPath(),
                    "-alreadyLinkedPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked")
                            .getPath(),
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Relation> tmp = sc.textFile(workingDir.toString()+"/relation")
+        JavaRDD<Relation> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+                sc.textFile(workingDir.toString() + "/relation")
                        .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
        Assertions.assertEquals(20, tmp.count());
-        Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
+        Dataset<Relation> verificationDs =
-        Assertions.assertEquals(8, verificationDs.filter("target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2'").count());
+                spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
-        Assertions.assertEquals(1, verificationDs.filter("target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091'").count());
+        Assertions.assertEquals(
-        Assertions.assertEquals(1, verificationDs.filter("target = '20|opendoar____::4429502fa1936b0941f4647b69b844c8'").count());
+                8,
                verificationDs
                        .filter("target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2'")
                        .count());
        Assertions.assertEquals(
                1,
                verificationDs
                        .filter("target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091'")
                        .count());
        Assertions.assertEquals(
                1,
                verificationDs
                        .filter("target = '20|opendoar____::4429502fa1936b0941f4647b69b844c8'")
                        .count());
-        Assertions.assertEquals(2, verificationDs.filter("source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and " +
+        Assertions.assertEquals(
-                "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' " +
+                2,
-                "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')").count());
+                verificationDs
                        .filter(
                                "source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and "
                                        + "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' "
                                        + "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')")
                        .count());
    }
    @Test
    public void UpdateMixTest() throws Exception {
-        SparkResultToOrganizationFromIstRepoJob2.main(new String[]{
+        SparkResultToOrganizationFromIstRepoJob2.main(
-                "-isTest", Boolean.TRUE.toString(),
+                new String[] {
-                "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-isTest",
-                "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix").getPath(),
+                    Boolean.TRUE.toString(),
-                "-hive_metastore_uris", "",
+                    "-isSparkSessionManaged",
-                "-resultTableName","eu.dnetlib.dhp.schema.oaf.Software",
+                    Boolean.FALSE.toString(),
-                "-writeUpdate", "false",
+                    "-sourcePath",
-                "-saveGraph", "true",
+                    getClass()
-                "-outputPath", workingDir.toString() + "/relation",
+                            .getResource(
-                "-datasourceOrganizationPath", getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization").getPath(),
+                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix")
-                "-alreadyLinkedPath",getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked").getPath(),
+                            .getPath(),
-        });
+                    "-hive_metastore_uris",
                    "",
                    "-resultTableName",
                    "eu.dnetlib.dhp.schema.oaf.Software",
                    "-writeUpdate",
                    "false",
                    "-saveGraph",
                    "true",
                    "-outputPath",
                    workingDir.toString() + "/relation",
                    "-datasourceOrganizationPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization")
                            .getPath(),
                    "-alreadyLinkedPath",
                    getClass()
                            .getResource(
                                    "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked")
                            .getPath(),
                });
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaRDD<Relation> tmp = sc.textFile(workingDir.toString()+"/relation")
+        JavaRDD<Relation> tmp =
-                .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+                sc.textFile(workingDir.toString() + "/relation")
                        .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
-        Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
+        Dataset<Relation> verificationDs =
                spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
        Assertions.assertEquals(8, verificationDs.count());
-        Assertions.assertEquals(2, verificationDs.filter("source = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6'").count());
+        Assertions.assertEquals(
-        Assertions.assertEquals(1, verificationDs.filter("source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218'").count());
+                2,
-        Assertions.assertEquals(1, verificationDs.filter("source = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523'").count());
+                verificationDs
                        .filter("source = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6'")
                        .count());
        Assertions.assertEquals(
                1,
                verificationDs
                        .filter("source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218'")
                        .count());
        Assertions.assertEquals(
                1,
                verificationDs
                        .filter("source = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523'")
                        .count());
-        Assertions.assertEquals(1, verificationDs.filter("source = '20|wt__________::a72760363ca885e6bef165804770e00c'").count());
+        Assertions.assertEquals(
                1,
                verificationDs
                        .filter("source = '20|wt__________::a72760363ca885e6bef165804770e00c'")
                        .count());
-        Assertions.assertEquals(4, verificationDs.filter("relclass = 'hasAuthorInstitution' and substring(source, 1,2) = '50'").count());
+        Assertions.assertEquals(
-        Assertions.assertEquals(4, verificationDs.filter("relclass = 'isAuthorInstitutionOf' and substring(source, 1,2) = '20'").count());
+                4,
                verificationDs
                        .filter(
                                "relclass = 'hasAuthorInstitution' and substring(source, 1,2) = '50'")
                        .count());
        Assertions.assertEquals(
                4,
                verificationDs
                        .filter(
                                "relclass = 'isAuthorInstitutionOf' and substring(source, 1,2) = '20'")
                        .count());
-        Assertions.assertEquals(4, verificationDs.filter("relclass = 'hasAuthorInstitution' and " +
+        Assertions.assertEquals(
-                "substring(source, 1,2) = '50' and substring(target, 1, 2) = '20'").count());
+                4,
-        Assertions.assertEquals(4, verificationDs.filter("relclass = 'isAuthorInstitutionOf' and " +
+                verificationDs
-                "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'").count());
+                        .filter(
                                "relclass = 'hasAuthorInstitution' and "
                                        + "substring(source, 1,2) = '50' and substring(target, 1, 2) = '20'")
                        .count());
        Assertions.assertEquals(
                4,
                verificationDs
                        .filter(
                                "relclass = 'isAuthorInstitutionOf' and "
                                        + "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'")
                        .count());
    }
 }