[EOSCTag] last test and change in the implementation to search in title and description

2022-05-02 17:43:20 +02:00 · 2022-05-02 17:43:20 +02:00 · a21fe310e5
parent e37177e1ce
commit a21fe310e5
7 changed files with 620 additions and 291 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -584,12 +584,10 @@ case object Crossref2Oaf {
        if (dp.length == 10) {
          return GraphCleaningFunctions.cleanDate(dp)
        }
-      }
-      else if (res.size ==2) {
+      } else if (res.size == 2) {
        val dp = f"${res.head}-${res(1)}%02d-01"
        return GraphCleaningFunctions.cleanDate(dp)
-      }
-      else if (res.size ==1) {
+      } else if (res.size == 1) {
        return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
      }
    }
--- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/crossref/CrossrefMappingTest.scala
@ -73,10 +73,10 @@ class CrossrefMappingTest {

  }

-
  @Test
  def crossrefIssueDateTest(): Unit = {
-    val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
+    val json =
+      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
    assertNotNull(json)
    assertFalse(json.isEmpty)
    val resultList: List[Oaf] = Crossref2Oaf.convert(json)
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
@ -1,10 +1,14 @@
+
 package eu.dnetlib.dhp.bulktag;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import static eu.dnetlib.dhp.PropagationConstant.readPath;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@ -14,29 +18,38 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Optional;
+import com.fasterxml.jackson.databind.ObjectMapper;

-import static eu.dnetlib.dhp.PropagationConstant.readPath;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

 public class SparkEoscTag {
 	private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
-    public static final Qualifier EOSC_QUALIFIER = OafMapperUtils.qualifier("eosc",
+	public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
+		.qualifier(
+			"eosc",
 			"European Open Science Cloud",
-            ModelConstants.DNET_SUBJECT_TYPOLOGIES,ModelConstants.DNET_SUBJECT_TYPOLOGIES);
-    public static final DataInfo EOSC_DATAINFO = OafMapperUtils.dataInfo(false, "propagation", true, false,
-            OafMapperUtils.qualifier("propagation:subject","Inferred by OpenAIRE",
-                    ModelConstants.DNET_PROVENANCE_ACTIONS,ModelConstants.DNET_PROVENANCE_ACTIONS), "0.9");
-    public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils.structuredProperty(
-            "EOSC::Jupyter Notebook", EOSC_QUALIFIER,EOSC_DATAINFO);
-    public final static StructuredProperty EOSC_GALAXY = OafMapperUtils.structuredProperty(
+			ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
+	public static final DataInfo EOSC_DATAINFO = OafMapperUtils
+		.dataInfo(
+			false, "propagation", true, false,
+			OafMapperUtils
+				.qualifier(
+					"propagation:subject", "Inferred by OpenAIRE",
+					ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
+			"0.9");
+	public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
+		.structuredProperty(
+			"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
+	public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
+		.structuredProperty(
 			"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
-    public final static StructuredProperty EOSC_TWITTER = OafMapperUtils.structuredProperty(
-            "EOSC::Twitter Data", EOSC_QUALIFIER,EOSC_DATAINFO);
+	public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
+		.structuredProperty(
+			"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);

 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
@ -80,50 +93,49 @@ public class SparkEoscTag {
 					s.setSubject(new ArrayList<>());
 				sbject = s.getSubject();

-                    if(containsCriteriaNotebook(s)){
+				if (containsCriteriaNotebook(s)) {
 					sbject.add(EOSC_NOTEBOOK);

 				}
-                    if(containsCriteriaGalaxy(s)){
+				if (containsCriteriaGalaxy(s)) {
 					sbject.add(EOSC_GALAXY);
 				}
 				return s;
-                }, Encoders.bean(Software.class) )
+			}, Encoders.bean(Software.class))
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(workingPath + "/software");

-        readPath(spark, workingPath + "/software" , Software.class)
+		readPath(spark, workingPath + "/software", Software.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "/software");

 		readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
-                .map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp ->
-                {
+			.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> {
 				List<StructuredProperty> sbject;
 				if (!Optional.ofNullable(orp.getSubject()).isPresent())
 					orp.setSubject(new ArrayList<>());
 				sbject = orp.getSubject();
-                    if(containsCriteriaGalaxy(orp)){
+				if (containsCriteriaGalaxy(orp)) {
 					sbject.add(EOSC_GALAXY);
 				}
-                    if(containscriteriaTwitter(orp)){
+				if (containscriteriaTwitter(orp)) {
 					sbject.add(EOSC_TWITTER);
 				}
 				return orp;
 			}, Encoders.bean(OtherResearchProduct.class))
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(workingPath + "/otherresearchproduct");

 		readPath(spark, workingPath + "/otherresearchproduct", OtherResearchProduct.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "/otherresearchproduct");

 		readPath(spark, inputPath + "/dataset", Dataset.class)
@ -132,44 +144,45 @@ public class SparkEoscTag {
 				if (!Optional.ofNullable(d.getSubject()).isPresent())
 					d.setSubject(new ArrayList<>());
 				sbject = d.getSubject();
-                    if(containscriteriaTwitter(d)){
+				if (containscriteriaTwitter(d)) {
 					sbject.add(EOSC_TWITTER);
 				}
 				return d;
-                } , Encoders.bean(Dataset.class) )
+			}, Encoders.bean(Dataset.class))
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(workingPath + "/dataset");

-        readPath(spark, workingPath + "/dataset" , Dataset.class)
+		readPath(spark, workingPath + "/dataset", Dataset.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-                .option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "/dataset");
 	}

 	private static boolean containscriteriaTwitter(Result r) {
-        if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("twitter") &&
-                t.getValue().toLowerCase().contains("data")))
+		Set<String> words = getWordsSP(r.getTitle());
+		words.addAll(getWordsF(r.getDescription()));
+
+		if (words.contains("twitter") &&
+			(words.contains("data") || words.contains("dataset")))
 			return true;
-        if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("twitter") &&
-                d.getValue().toLowerCase().contains("data") ))
-            return true;
-        if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
+
+		if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
 			r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
 			return true;
 		return false;
 	}

 	private static boolean containsCriteriaGalaxy(Result r) {
-        if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("galaxy") &&
-                (t.getValue().toLowerCase().contains("workflow") || t.getValue().toLowerCase().contains("software"))))
+		Set<String> words = getWordsSP(r.getTitle());
+		words.addAll(getWordsF(r.getDescription()));
+		if (words.contains("galaxy") &&
+			(words.contains("workflow") || words.contains("software")))
 			return true;
-        if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("galaxy") &&
-                (d.getValue().toLowerCase().contains("workflow") || d.getValue().toLowerCase().contains("software"))))
-            return true;
-        if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
+
+		if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
 			(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) ||
 			r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("software")))
 			return true;
@ -177,22 +190,42 @@ public class SparkEoscTag {
 	}

 	private static boolean containsCriteriaNotebook(Software s) {
-        if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
+		if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
 			return true;
-        if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python") &&
+		if (s
+			.getSubject()
+			.stream()
+			.anyMatch(
+				sbj -> sbj.getValue().toLowerCase().contains("python") &&
 					sbj.getValue().toLowerCase().contains("notebook")))
 			return true;
-        if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
+		if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
 			s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("notebook")))
 			return true;
 		return false;
 	}

-    private static boolean containsTitleNotebook(Software s) {
-        if (s.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("jupyter") &&
-                t.getValue().toLowerCase().contains("notebook")))
-            return true;
-        return false;
+	private static Set<String> getSubjects(List<StructuredProperty> s) {
+		Set<String> subjects = new HashSet<>();
+		s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" "))));
+		s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase()));
+		return subjects;
 	}

+	private static Set<String> getWordsSP(List<StructuredProperty> elem) {
+		Set<String> words = new HashSet<>();
+		elem
+			.forEach(
+				t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))));
+		return words;
+	}
+
+	private static Set<String> getWordsF(List<Field<String>> elem) {
+		Set<String> words = new HashSet<>();
+		elem
+			.forEach(
+				t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))));
+		return words;
+
+	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
@ -1,8 +1,13 @@

 package eu.dnetlib.dhp.bulktag;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.oaf.*;
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@ -11,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
@ -19,26 +25,20 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.List;
+import com.fasterxml.jackson.databind.ObjectMapper;

-import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
+import eu.dnetlib.dhp.schema.oaf.*;

 public class EOSCTagJobTest {

 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

-
-
 	private static SparkSession spark;

 	private static Path workingDir;

 	private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);

-
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(EOSCTagJobTest.class.getSimpleName());
@ -70,22 +70,38 @@ public class EOSCTagJobTest {
 	@Test
 	void jupyterUpdatesTest() throws Exception {

-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
-				.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
+			.map(
+				(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
+				Encoders.bean(Software.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/software");

-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
-				.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
+			.map(
+				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
+				Encoders.bean(Dataset.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/dataset");

-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
-				.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
+			.map(
+				(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
+					.readValue(value, OtherResearchProduct.class),
+				Encoders.bean(OtherResearchProduct.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/otherresearchproduct");

 		SparkEoscTag
@ -106,36 +122,109 @@ public class EOSCTagJobTest {

 		Assertions.assertEquals(10, tmp.count());

-		Assertions.assertEquals(4, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
+		Assertions
+			.assertEquals(
+				4,
+				tmp
+					.filter(
+						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
+					.count());

-		Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+		Assertions
+			.assertEquals(
+				2, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));

+		Assertions
+			.assertEquals(
+				5, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertFalse(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));

-		Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+		Assertions
+			.assertEquals(
+				9, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));

-		Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+		Assertions
+			.assertEquals(
+				5, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertFalse(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));

-		Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
+		Assertions
+			.assertEquals(
+				9, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));

-		Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
-
-		List<StructuredProperty> subjects = tmp.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")).collect()
-				.get(0).getSubject();
+		List<StructuredProperty> subjects = tmp
+			.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
+			.collect()
+			.get(0)
+			.getSubject();
 		Assertions.assertEquals(8, subjects.size());
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
@ -146,44 +235,75 @@ public class EOSCTagJobTest {
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de gaz")));
 		Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de liquide")));

-
-		Assertions.assertEquals(10, sc
+		Assertions
+			.assertEquals(
+				10, sc
 					.textFile(workingDir.toString() + "/input/dataset")
-				.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
+					.count());

-		Assertions.assertEquals(0, sc
+		Assertions
+			.assertEquals(
+				0, sc
 					.textFile(workingDir.toString() + "/input/dataset")
-				.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
+					.filter(
+						ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
+					.count());

-
-		Assertions.assertEquals(10, sc
+		Assertions
+			.assertEquals(
+				10, sc
 					.textFile(workingDir.toString() + "/input/otherresearchproduct")
-				.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
+					.count());

-		Assertions.assertEquals(0, sc
+		Assertions
+			.assertEquals(
+				0, sc
 					.textFile(workingDir.toString() + "/input/otherresearchproduct")
-				.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
+					.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
+					.filter(
+						ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
+					.count());
+
+		// spark.stop();
 	}

-
 	@Test
 	void galaxyUpdatesTest() throws Exception {
-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
-				.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
+			.map(
+				(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
+				Encoders.bean(Software.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/software");

-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
-				.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
+			.map(
+				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
+				Encoders.bean(Dataset.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/dataset");

-		spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
-				.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
+			.map(
+				(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
+					.readValue(value, OtherResearchProduct.class),
+				Encoders.bean(OtherResearchProduct.class))
 			.write()
-				.option("compression","gzip")
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/otherresearchproduct");

 		SparkEoscTag
@ -204,46 +324,224 @@ public class EOSCTagJobTest {

 		Assertions.assertEquals(10, tmp.count());

-		Assertions.assertEquals(2, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
+		Assertions
+			.assertEquals(
+				2,
+				tmp
+					.filter(
+						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
+					.count());

-		Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+		Assertions
+			.assertEquals(
+				2, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));

+		Assertions
+			.assertEquals(
+				6, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));

-		Assertions.assertEquals(6, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+		Assertions
+			.assertEquals(
+				8, tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertFalse(
+				tmp
+					.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));

-		Assertions.assertEquals(8, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
-
-		JavaRDD<OtherResearchProduct> orp = sc.textFile(workingDir.toString() + "/input/otherresearchproduct").map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
+		JavaRDD<OtherResearchProduct> orp = sc
+			.textFile(workingDir.toString() + "/input/otherresearchproduct")
+			.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));

 		Assertions.assertEquals(10, orp.count());

-		Assertions.assertEquals(2, orp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
+		Assertions
+			.assertEquals(
+				2,
+				orp
+					.filter(
+						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
+					.count());

+		Assertions
+			.assertEquals(
+				3, orp
+					.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				orp
+					.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));

-		Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+		Assertions
+			.assertEquals(
+				2, orp
+					.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertFalse(
+				orp
+					.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));

-		Assertions.assertEquals(2, orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertFalse(orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
-
-		Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
-				.get(0).getSubject().size());
-		Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
-				.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
+		Assertions
+			.assertEquals(
+				3, orp
+					.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.size());
+		Assertions
+			.assertTrue(
+				orp
+					.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
+					.collect()
+					.get(0)
+					.getSubject()
+					.stream()
+					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));

 	}

+	@Test
+	void twitterUpdatesTest() throws Exception {
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/software").getPath())
+			.map(
+				(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
+				Encoders.bean(Software.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(workingDir.toString() + "/input/software");
+
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/dataset").getPath())
+			.map(
+				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
+				Encoders.bean(Dataset.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(workingDir.toString() + "/input/dataset");
+
+		spark
+			.read()
+			.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct").getPath())
+			.map(
+				(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
+					.readValue(value, OtherResearchProduct.class),
+				Encoders.bean(OtherResearchProduct.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(workingDir.toString() + "/input/otherresearchproduct");
+
+		SparkEoscTag
+			.main(
+				new String[] {
+					"-isSparkSessionManaged", Boolean.FALSE.toString(),
+					"-sourcePath",
+					workingDir.toString() + "/input",
+					"-workingPath", workingDir.toString() + "/working"
+
+				});
+
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		JavaRDD<Software> tmp = sc
+			.textFile(workingDir.toString() + "/input/software")
+			.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
+
+		Assertions.assertEquals(10, tmp.count());
+
+		Assertions
+			.assertEquals(
+				0,
+				tmp
+					.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
+					.count());
+
+		JavaRDD<OtherResearchProduct> orp = sc
+			.textFile(workingDir.toString() + "/input/otherresearchproduct")
+			.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
+
+		Assertions.assertEquals(10, orp.count());
+
+		Assertions
+			.assertEquals(
+				3,
+				orp
+					.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
+					.count());
+
+		JavaRDD<Dataset> dats = sc
+			.textFile(workingDir.toString() + "/input/dataset")
+			.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
+
+		Assertions.assertEquals(10, dats.count());
+
+		Assertions
+			.assertEquals(
+				2,
+				dats
+					.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
+					.count());
+
+	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/dataset/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/dataset/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct/otherresearchproduct_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct/otherresearchproduct_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/software/software_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/software/software_10.json