[EOSCTag] last test and change in the implementation to search in title and description

2022-05-02 17:43:20 +02:00 · 2022-05-02 17:43:20 +02:00 · a21fe310e5
parent e37177e1ce
commit a21fe310e5
7 changed files with 620 additions and 291 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -584,12 +584,10 @@ case object Crossref2Oaf {
        if (dp.length == 10) {
          return GraphCleaningFunctions.cleanDate(dp)
        }
-      }
+      } else if (res.size == 2) {
      else if (res.size ==2) {
        val dp = f"${res.head}-${res(1)}%02d-01"
        return GraphCleaningFunctions.cleanDate(dp)
-      }
+      } else if (res.size == 1) {
      else if (res.size ==1) {
        return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
      }
    }
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
@ -73,10 +73,10 @@ class CrossrefMappingTest {
  }
  @Test
  def crossrefIssueDateTest(): Unit = {
-    val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
+    val json =
      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
    assertNotNull(json)
    assertFalse(json.isEmpty)
    val resultList: List[Oaf] = Crossref2Oaf.convert(json)
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
@ -1,10 +1,14 @@
 package eu.dnetlib.dhp.bulktag;
-import com.fasterxml.jackson.databind.ObjectMapper;
+import static eu.dnetlib.dhp.PropagationConstant.readPath;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
+
-import eu.dnetlib.dhp.schema.oaf.*;
+import java.util.*;
-import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@ -14,29 +18,38 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import java.util.ArrayList;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
-import static eu.dnetlib.dhp.PropagationConstant.readPath;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 public class SparkEoscTag {
 	private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-    public static final Qualifier EOSC_QUALIFIER = OafMapperUtils.qualifier("eosc",
+	public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
 		.qualifier(
 			"eosc",
 			"European Open Science Cloud",
-            ModelConstants.DNET_SUBJECT_TYPOLOGIES,ModelConstants.DNET_SUBJECT_TYPOLOGIES);
+			ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
-    public static final DataInfo EOSC_DATAINFO = OafMapperUtils.dataInfo(false, "propagation", true, false,
+	public static final DataInfo EOSC_DATAINFO = OafMapperUtils
-            OafMapperUtils.qualifier("propagation:subject","Inferred by OpenAIRE",
+		.dataInfo(
-                    ModelConstants.DNET_PROVENANCE_ACTIONS,ModelConstants.DNET_PROVENANCE_ACTIONS), "0.9");
+			false, "propagation", true, false,
-    public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils.structuredProperty(
+			OafMapperUtils
-            "EOSC::Jupyter Notebook", EOSC_QUALIFIER,EOSC_DATAINFO);
+				.qualifier(
-    public final static StructuredProperty EOSC_GALAXY = OafMapperUtils.structuredProperty(
+					"propagation:subject", "Inferred by OpenAIRE",
 					ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.9");
 	public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
 		.structuredProperty(
 			"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
 	public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
 		.structuredProperty(
 			"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
-    public final static StructuredProperty EOSC_TWITTER = OafMapperUtils.structuredProperty(
+	public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
-            "EOSC::Twitter Data", EOSC_QUALIFIER,EOSC_DATAINFO);
+		.structuredProperty(
 			"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
@ -80,50 +93,49 @@ public class SparkEoscTag {
 					s.setSubject(new ArrayList<>());
 				sbject = s.getSubject();
-                    if(containsCriteriaNotebook(s)){
+				if (containsCriteriaNotebook(s)) {
 					sbject.add(EOSC_NOTEBOOK);
 				}
-                    if(containsCriteriaGalaxy(s)){
+				if (containsCriteriaGalaxy(s)) {
 					sbject.add(EOSC_GALAXY);
 				}
 				return s;
-                }, Encoders.bean(Software.class) )
+			}, Encoders.bean(Software.class))
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(workingPath + "/software");
-        readPath(spark, workingPath + "/software" , Software.class)
+		readPath(spark, workingPath + "/software", Software.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "/software");
 		readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
-                .map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp ->
+			.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> {
                {
 				List<StructuredProperty> sbject;
 				if (!Optional.ofNullable(orp.getSubject()).isPresent())
 					orp.setSubject(new ArrayList<>());
 				sbject = orp.getSubject();
-                    if(containsCriteriaGalaxy(orp)){
+				if (containsCriteriaGalaxy(orp)) {
 					sbject.add(EOSC_GALAXY);
 				}
-                    if(containscriteriaTwitter(orp)){
+				if (containscriteriaTwitter(orp)) {
 					sbject.add(EOSC_TWITTER);
 				}
 				return orp;
 			}, Encoders.bean(OtherResearchProduct.class))
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(workingPath + "/otherresearchproduct");
 		readPath(spark, workingPath + "/otherresearchproduct", OtherResearchProduct.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "/otherresearchproduct");
 		readPath(spark, inputPath + "/dataset", Dataset.class)
@ -132,44 +144,45 @@ public class SparkEoscTag {
 				if (!Optional.ofNullable(d.getSubject()).isPresent())
 					d.setSubject(new ArrayList<>());
 				sbject = d.getSubject();
-                    if(containscriteriaTwitter(d)){
+				if (containscriteriaTwitter(d)) {
 					sbject.add(EOSC_TWITTER);
 				}
 				return d;
-                } , Encoders.bean(Dataset.class) )
+			}, Encoders.bean(Dataset.class))
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(workingPath + "/dataset");
-        readPath(spark, workingPath + "/dataset" , Dataset.class)
+		readPath(spark, workingPath + "/dataset", Dataset.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "/dataset");
 	}
 	private static boolean containscriteriaTwitter(Result r) {
-        if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("twitter") &&
+		Set<String> words = getWordsSP(r.getTitle());
-                t.getValue().toLowerCase().contains("data")))
+		words.addAll(getWordsF(r.getDescription()));
 		if (words.contains("twitter") &&
 			(words.contains("data") || words.contains("dataset")))
 			return true;
-        if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("twitter") &&
+
-                d.getValue().toLowerCase().contains("data") ))
+		if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
            return true;
        if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
 			r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
 			return true;
 		return false;
 	}
 	private static boolean containsCriteriaGalaxy(Result r) {
-        if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("galaxy") &&
+		Set<String> words = getWordsSP(r.getTitle());
-                (t.getValue().toLowerCase().contains("workflow") || t.getValue().toLowerCase().contains("software"))))
+		words.addAll(getWordsF(r.getDescription()));
 		if (words.contains("galaxy") &&
 			(words.contains("workflow") || words.contains("software")))
 			return true;
-        if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("galaxy") &&
+
-                (d.getValue().toLowerCase().contains("workflow") || d.getValue().toLowerCase().contains("software"))))
+		if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
            return true;
        if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
 			(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) ||
 			r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("software")))
 			return true;
@ -177,22 +190,42 @@ public class SparkEoscTag {
 	}
 	private static boolean containsCriteriaNotebook(Software s) {
-        if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
+		if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
 			return true;
-        if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python") &&
+		if (s
 			.getSubject()
 			.stream()
 			.anyMatch(
 				sbj -> sbj.getValue().toLowerCase().contains("python") &&
 					sbj.getValue().toLowerCase().contains("notebook")))
 			return true;
-        if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
+		if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
 			s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("notebook")))
 			return true;
 		return false;
 	}
-    private static boolean containsTitleNotebook(Software s) {
+	private static Set<String> getSubjects(List<StructuredProperty> s) {
-        if (s.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("jupyter") &&
+		Set<String> subjects = new HashSet<>();
-                t.getValue().toLowerCase().contains("notebook")))
+		s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" "))));
-            return true;
+		s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase()));
-        return false;
+		return subjects;
 	}
 	/**
 	 * Collects the distinct lower-cased words found in the values of the given structured properties.
 	 *
 	 * <p>Every character that is neither an ASCII letter nor whitespace is dropped, then the text is split on
 	 * runs of whitespace. Tabs and line breaks therefore separate words instead of being deleted (which used to
 	 * glue the neighbouring words together), and no empty token is ever added to the result.
 	 *
 	 * @param elem the properties to tokenize; may be {@code null} or contain {@code null} entries/values,
 	 *             which are skipped
 	 * @return a mutable set with the words found, empty when there is nothing to tokenize
 	 */
 	private static Set<String> getWordsSP(List<StructuredProperty> elem) {
 		Set<String> words = new HashSet<>();
 		if (elem == null) {
 			return words;
 		}
 		for (StructuredProperty t : elem) {
 			if (t == null || t.getValue() == null) {
 				continue;
 			}
 			// Locale.ROOT keeps the lower-casing independent from the JVM default locale
 			// (e.g. under a Turkish locale 'I' would become a dotless 'ı' and be stripped by the regex)
 			String cleaned = t.getValue().toLowerCase(Locale.ROOT).replaceAll("[^a-z\\s]", "");
 			for (String word : cleaned.split("\\s+")) {
 				if (!word.isEmpty()) {
 					words.add(word);
 				}
 			}
 		}
 		return words;
 	}
 	/**
 	 * Collects the distinct lower-cased words found in the values of the given string fields.
 	 *
 	 * <p>Every character that is neither an ASCII letter nor whitespace is dropped, then the text is split on
 	 * runs of whitespace. Tabs and line breaks therefore separate words instead of being deleted (which used to
 	 * glue the neighbouring words together), and no empty token is ever added to the result.
 	 *
 	 * @param elem the fields to tokenize; may be {@code null} or contain {@code null} entries/values,
 	 *             which are skipped
 	 * @return a mutable set with the words found, empty when there is nothing to tokenize
 	 */
 	private static Set<String> getWordsF(List<Field<String>> elem) {
 		Set<String> words = new HashSet<>();
 		if (elem == null) {
 			return words;
 		}
 		for (Field<String> t : elem) {
 			if (t == null || t.getValue() == null) {
 				continue;
 			}
 			// Locale.ROOT keeps the lower-casing independent from the JVM default locale
 			// (e.g. under a Turkish locale 'I' would become a dotless 'ı' and be stripped by the regex)
 			String cleaned = t.getValue().toLowerCase(Locale.ROOT).replaceAll("[^a-z\\s]", "");
 			for (String word : cleaned.split("\\s+")) {
 				if (!word.isEmpty()) {
 					words.add(word);
 				}
 			}
 		}
 		return words;
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
@ -1,8 +1,13 @@
 package eu.dnetlib.dhp.bulktag;
-import com.fasterxml.jackson.databind.ObjectMapper;
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
-import eu.dnetlib.dhp.schema.oaf.*;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@ -11,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
@ -19,26 +25,20 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import java.io.IOException;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
-import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
+import eu.dnetlib.dhp.schema.oaf.*;
 public class EOSCTagJobTest {
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static SparkSession spark;
 	private static Path workingDir;
 	private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(EOSCTagJobTest.class.getSimpleName());
@ -70,22 +70,38 @@ public class EOSCTagJobTest {
 	@Test
 	void jupyterUpdatesTest() throws Exception {
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
+		spark
-				.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
+			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
 			.map(
 				(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
 				Encoders.bean(Software.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/software");
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
+		spark
-				.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
+			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
 			.map(
 				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
 				Encoders.bean(Dataset.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/dataset");
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
+		spark
-				.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
+			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
 			.map(
 				(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
 					.readValue(value, OtherResearchProduct.class),
 				Encoders.bean(OtherResearchProduct.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/otherresearchproduct");
 		SparkEoscTag
@ -106,36 +122,109 @@ public class EOSCTagJobTest {
 		Assertions.assertEquals(10, tmp.count());
-		Assertions.assertEquals(4, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
+		Assertions
 			.assertEquals(
 				4,
 				tmp
 					.filter(
 						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
 					.count());
-		Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
+				2, tmp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
 		Assertions
 			.assertEquals(
 				5, tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertFalse(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
-		Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
+				9, tmp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
-		Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
+				5, tmp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+					.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertFalse(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
-		Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
+				9, tmp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+					.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
-		Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
+		List<StructuredProperty> subjects = tmp
-				.get(0).getSubject().size());
+			.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
+			.collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+			.get(0)
-
+			.getSubject();
 		List<StructuredProperty> subjects = tmp.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")).collect()
 				.get(0).getSubject();
 		Assertions.assertEquals(8, subjects.size());
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
@ -146,44 +235,75 @@ public class EOSCTagJobTest {
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de gaz")));
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de liquide")));
-
+		Assertions
-		Assertions.assertEquals(10, sc
+			.assertEquals(
 				10, sc
 					.textFile(workingDir.toString() + "/input/dataset")
-				.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
 					.count());
-		Assertions.assertEquals(0, sc
+		Assertions
 			.assertEquals(
 				0, sc
 					.textFile(workingDir.toString() + "/input/dataset")
-				.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
 					.filter(
 						ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
 					.count());
-
+		Assertions
-		Assertions.assertEquals(10, sc
+			.assertEquals(
 				10, sc
 					.textFile(workingDir.toString() + "/input/otherresearchproduct")
-				.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
 					.count());
-		Assertions.assertEquals(0, sc
+		Assertions
 			.assertEquals(
 				0, sc
 					.textFile(workingDir.toString() + "/input/otherresearchproduct")
-				.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
 					.filter(
 						ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
 					.count());
 		// spark.stop();
 	}
 	@Test
 	void galaxyUpdatesTest() throws Exception {
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
+		spark
-				.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
+			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
 			.map(
 				(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
 				Encoders.bean(Software.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/software");
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
+		spark
-				.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
+			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
 			.map(
 				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
 				Encoders.bean(Dataset.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/dataset");
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
+		spark
-				.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
+			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
 			.map(
 				(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
 					.readValue(value, OtherResearchProduct.class),
 				Encoders.bean(OtherResearchProduct.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/otherresearchproduct");
 		SparkEoscTag
@ -204,46 +324,224 @@ public class EOSCTagJobTest {
 		Assertions.assertEquals(10, tmp.count());
-		Assertions.assertEquals(2, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
+		Assertions
 			.assertEquals(
 				2,
 				tmp
 					.filter(
 						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
 					.count());
-		Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
+				2, tmp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
 		Assertions
 			.assertEquals(
 				6, tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
-		Assertions.assertEquals(6, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
+				8, tmp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertFalse(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
-		Assertions.assertEquals(8, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
+		JavaRDD<OtherResearchProduct> orp = sc
-				.get(0).getSubject().size());
+			.textFile(workingDir.toString() + "/input/otherresearchproduct")
-		Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
+			.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
 				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
 		JavaRDD<OtherResearchProduct> orp = sc.textFile(workingDir.toString() + "/input/otherresearchproduct").map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
 		Assertions.assertEquals(10, orp.count());
-		Assertions.assertEquals(2, orp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
+		Assertions
 			.assertEquals(
 				2,
 				orp
 					.filter(
 						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
 					.count());
 		Assertions
 			.assertEquals(
 				3, orp
 					.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				orp
 					.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
-		Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
+				2, orp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+					.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertFalse(
 				orp
 					.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
-		Assertions.assertEquals(2, orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
+		Assertions
-				.get(0).getSubject().size());
+			.assertEquals(
-		Assertions.assertFalse(orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
+				3, orp
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+					.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
-
+					.collect()
-		Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
+					.get(0)
-				.get(0).getSubject().size());
+					.getSubject()
-		Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
+					.size());
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+		Assertions
 			.assertTrue(
 				orp
 					.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
 	}
 	/**
 	 * Runs {@link SparkEoscTag} against the "twitter" test resources and checks how many records of each
 	 * result type carry the "EOSC::Twitter Data" subject afterwards.
 	 *
 	 * <p>The three resource sets (software, dataset, otherresearchproduct) are first copied as gzip-compressed
 	 * JSON under {@code workingDir/input}; the job is then launched with that folder as source path and
 	 * {@code workingDir/working} as working path. The assertions read the records back from
 	 * {@code workingDir/input}.
 	 *
 	 * @throws Exception if reading the resources or running the job fails
 	 */
 	@Test
 	void twitterUpdatesTest() throws Exception {
 		// stage the software fixtures as gzip JSON under <workingDir>/input/software
 		spark
 			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/software").getPath())
 			.map(
 				(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
 				Encoders.bean(Software.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/software");
 		// stage the dataset fixtures under <workingDir>/input/dataset
 		spark
 			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/dataset").getPath())
 			.map(
 				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
 				Encoders.bean(Dataset.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/dataset");
 		// stage the other-research-product fixtures under <workingDir>/input/otherresearchproduct
 		spark
 			.read()
 			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct").getPath())
 			.map(
 				(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
 					.readValue(value, OtherResearchProduct.class),
 				Encoders.bean(OtherResearchProduct.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/otherresearchproduct");
 		// run the tagging job reusing the test Spark session (isSparkSessionManaged = false)
 		SparkEoscTag
 			.main(
 				new String[] {
 					"-isSparkSessionManaged", Boolean.FALSE.toString(),
 					"-sourcePath",
 					workingDir.toString() + "/input",
 					"-workingPath", workingDir.toString() + "/working"
 				});
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		// software: all 10 records are still there and none of them is tagged as Twitter data
 		JavaRDD<Software> tmp = sc
 			.textFile(workingDir.toString() + "/input/software")
 			.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
 		Assertions.assertEquals(10, tmp.count());
 		Assertions
 			.assertEquals(
 				0,
 				tmp
 					.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
 					.count());
 		// other research products: 3 out of 10 records are expected to carry the Twitter tag
 		JavaRDD<OtherResearchProduct> orp = sc
 			.textFile(workingDir.toString() + "/input/otherresearchproduct")
 			.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
 		Assertions.assertEquals(10, orp.count());
 		Assertions
 			.assertEquals(
 				3,
 				orp
 					.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
 					.count());
 		// datasets: 2 out of 10 records are expected to carry the Twitter tag
 		JavaRDD<Dataset> dats = sc
 			.textFile(workingDir.toString() + "/input/dataset")
 			.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
 		Assertions.assertEquals(10, dats.count());
 		Assertions
 			.assertEquals(
 				2,
 				dats
 					.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
 					.count());
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/dataset/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/dataset/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct/otherresearchproduct_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct/otherresearchproduct_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/software/software_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/software/software_10.json