Merge pull request '[eosc tag] set the eosc subjects, rough implementation' (#215) from eosc_tag into beta

Reviewed-on: D-Net/dnet-hadoop#215
2022-05-04 10:11:14 +02:00 · 2022-05-04 10:11:14 +02:00 · 5fe25cc51c
parent da611cfbbd 8a72de4011
commit 5fe25cc51c
13 changed files with 928 additions and 1 deletions
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
@ -0,0 +1,243 @@
 package eu.dnetlib.dhp.bulktag;
 import static eu.dnetlib.dhp.PropagationConstant.readPath;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.function.Function;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 /**
  * Spark job that tags results with EOSC subjects, using keyword heuristics on subjects, titles and descriptions.
  * <p>
  * For each of {@code software}, {@code otherresearchproduct} and {@code dataset} the records found under
  * {@code sourcePath} are enriched with the {@code EOSC::*} subjects whose criteria they satisfy, staged under
  * {@code workingPath}, and then written back over the original {@code sourcePath} location.
  */
 public class SparkEoscTag {
 	private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	/** Qualifier shared by every EOSC subject added by this job. */
 	public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
 		.qualifier(
 			"eosc",
 			"European Open Science Cloud",
 			ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
 	/** Provenance information shared by every EOSC subject added by this job. */
 	public static final DataInfo EOSC_DATAINFO = OafMapperUtils
 		.dataInfo(
 			false, "propagation", true, false,
 			OafMapperUtils
 				.qualifier(
 					"propagation:subject", "Inferred by OpenAIRE",
 					ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
 			"0.9");
 	public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
 		.structuredProperty(
 			"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
 	public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
 		.structuredProperty(
 			"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
 	public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
 		.structuredProperty(
 			"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);
 	/** Classpath location of the JSON descriptor of the command line arguments. */
 	private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json";
 	/** Matches every character that is neither an ASCII letter nor a space; compiled once and reused. */
 	private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z ]");
 	/**
 	 * Entry point: parses the arguments and runs the tagging inside a Spark session.
 	 *
 	 * @param args command line arguments: {@code sourcePath} and {@code workingPath} are required,
 	 *             {@code isSparkSessionManaged} is optional and defaults to {@code true}
 	 * @throws Exception if the arguments cannot be parsed or the Spark job fails
 	 */
 	public static void main(String[] args) throws Exception {
 		final String jsonConfiguration;
 		// Close the stream deterministically, decode it explicitly as UTF-8 and fail with a readable message
 		// when the descriptor is missing from the classpath.
 		try (InputStream is = SparkEoscTag.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
 			jsonConfiguration = IOUtils
 				.toString(
 					Objects.requireNonNull(is, "resource not found: " + PARAMETERS_RESOURCE),
 					StandardCharsets.UTF_8);
 		}
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String inputPath = parser.get("sourcePath");
 		log.info("inputPath: {}", inputPath);
 		final String workingPath = parser.get("workingPath");
 		log.info("workingPath: {}", workingPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				execEoscTag(spark, inputPath, workingPath);
 			});
 	}
 	/**
 	 * Tags software, other research products and datasets, one entity type after the other.
 	 *
 	 * @param spark       the Spark session to use
 	 * @param inputPath   base path of the graph; it is both read and overwritten
 	 * @param workingPath base path used to stage the tagged records before they replace the input
 	 */
 	private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) {
 		tagEntity(spark, inputPath, workingPath, "software", Software.class, SparkEoscTag::tagSoftware);
 		tagEntity(
 			spark, inputPath, workingPath, "otherresearchproduct", OtherResearchProduct.class,
 			SparkEoscTag::tagOtherResearchProduct);
 		tagEntity(spark, inputPath, workingPath, "dataset", Dataset.class, SparkEoscTag::tagDataset);
 	}
 	/**
 	 * Applies {@code tagger} to every record of one entity type and replaces the input with the result.
 	 * <p>
 	 * The tagged records are first written under {@code workingPath} and then copied back, because the input
 	 * location cannot be overwritten while it is still being read.
 	 * NOTE(review): a failure during the copy-back step can leave the input location incomplete.
 	 *
 	 * @param entity name of the sub-folder holding the records, e.g. {@code "software"}
 	 * @param clazz  bean class of the records
 	 * @param tagger record transformation; {@link MapFunction} is used because it is serializable
 	 */
 	private static <R extends Result> void tagEntity(SparkSession spark, String inputPath, String workingPath,
 		String entity, Class<R> clazz, MapFunction<R, R> tagger) {
 		final String source = inputPath + "/" + entity;
 		final String staging = workingPath + "/" + entity;
 		readPath(spark, source, clazz)
 			.map(tagger, Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(staging);
 		readPath(spark, staging, clazz)
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(source);
 	}
 	/** Adds the Jupyter Notebook and Galaxy Workflow subjects to a software when their criteria match. */
 	private static Software tagSoftware(Software s) {
 		final List<StructuredProperty> subjects = ensureSubjects(s);
 		if (containsCriteriaNotebook(s)) {
 			addTag(subjects, EOSC_NOTEBOOK);
 		}
 		if (containsCriteriaGalaxy(s)) {
 			addTag(subjects, EOSC_GALAXY);
 		}
 		return s;
 	}
 	/** Adds the Galaxy Workflow and Twitter Data subjects to an other research product when their criteria match. */
 	private static OtherResearchProduct tagOtherResearchProduct(OtherResearchProduct orp) {
 		final List<StructuredProperty> subjects = ensureSubjects(orp);
 		if (containsCriteriaGalaxy(orp)) {
 			addTag(subjects, EOSC_GALAXY);
 		}
 		if (containsCriteriaTwitter(orp)) {
 			addTag(subjects, EOSC_TWITTER);
 		}
 		return orp;
 	}
 	/** Adds the Twitter Data subject to a dataset when its criteria match. */
 	private static Dataset tagDataset(Dataset d) {
 		final List<StructuredProperty> subjects = ensureSubjects(d);
 		if (containsCriteriaTwitter(d)) {
 			addTag(subjects, EOSC_TWITTER);
 		}
 		return d;
 	}
 	/** Returns the subject list of {@code r}, installing an empty mutable list first when it is {@code null}. */
 	private static List<StructuredProperty> ensureSubjects(Result r) {
 		if (r.getSubject() == null) {
 			r.setSubject(new ArrayList<>());
 		}
 		return r.getSubject();
 	}
 	/**
 	 * Appends {@code tag} unless a subject with the same value is already present.
 	 * <p>
 	 * Each EOSC tag value satisfies its own criterion (e.g. "EOSC::Jupyter Notebook" contains "jupyter"), so
 	 * without this guard every re-run over already tagged data would add a duplicate.
 	 */
 	private static void addTag(List<StructuredProperty> subjects, StructuredProperty tag) {
 		final boolean alreadyTagged = subjects
 			.stream()
 			.anyMatch(sbj -> sbj != null && tag.getValue().equals(sbj.getValue()));
 		if (!alreadyTagged) {
 			subjects.add(tag);
 		}
 	}
 	/**
 	 * Twitter criterion: title/description words contain "twitter" together with "data" or "dataset", or one
 	 * subject contains "twitter" and one subject contains "data".
 	 */
 	private static boolean containsCriteriaTwitter(Result r) {
 		final Set<String> words = getWords(r);
 		if (words.contains("twitter") && (words.contains("data") || words.contains("dataset"))) {
 			return true;
 		}
 		return anySubjectContains(r, "twitter") && anySubjectContains(r, "data");
 	}
 	/**
 	 * Galaxy criterion: title/description words contain both "galaxy" and "workflow", or one subject contains
 	 * "galaxy" and one subject contains "workflow".
 	 */
 	private static boolean containsCriteriaGalaxy(Result r) {
 		final Set<String> words = getWords(r);
 		if (words.contains("galaxy") && words.contains("workflow")) {
 			return true;
 		}
 		return anySubjectContains(r, "galaxy") && anySubjectContains(r, "workflow");
 	}
 	/**
 	 * Notebook criterion: one subject contains "jupyter", or one subject contains "python" and one subject
 	 * (possibly the same one) contains "notebook".
 	 */
 	private static boolean containsCriteriaNotebook(Software s) {
 		return anySubjectContains(s, "jupyter")
 			|| (anySubjectContains(s, "python") && anySubjectContains(s, "notebook"));
 	}
 	/**
 	 * Tells whether at least one subject value of {@code r} contains {@code token}, ignoring case.
 	 * Missing subject lists, {@code null} subjects and {@code null} values never match.
 	 *
 	 * @param token lower-case text to look for
 	 */
 	private static boolean anySubjectContains(Result r, String token) {
 		final List<StructuredProperty> subjects = r.getSubject();
 		if (subjects == null) {
 			return false;
 		}
 		return subjects
 			.stream()
 			.anyMatch(
 				sbj -> sbj != null && sbj.getValue() != null
 					&& sbj.getValue().toLowerCase(Locale.ROOT).contains(token));
 	}
 	/** Collects the lower-cased words found in the titles and descriptions of {@code r}. */
 	private static Set<String> getWords(Result r) {
 		final Set<String> words = new HashSet<>();
 		addWords(words, r.getTitle(), StructuredProperty::getValue);
 		addWords(words, r.getDescription(), Field::getValue);
 		return words;
 	}
 	/**
 	 * Lower-cases each value, drops every character that is neither an ASCII letter nor a space and adds the
 	 * space-separated tokens to {@code words}. A {@code null} list, element or value is skipped.
 	 * <p>
 	 * Lower-casing uses {@link Locale#ROOT} so that matching does not depend on the JVM default locale.
 	 * NOTE(review): punctuation and line breaks are removed rather than replaced by a space, so "twitter-data"
 	 * becomes the single token "twitterdata"; kept as is to preserve the current matching.
 	 */
 	private static <T> void addWords(Set<String> words, List<T> elems, Function<T, String> valueOf) {
 		if (elems == null) {
 			return;
 		}
 		for (T elem : elems) {
 			final String value = elem == null ? null : valueOf.apply(elem);
 			if (value != null) {
 				final String cleaned = NON_LETTERS.matcher(value.toLowerCase(Locale.ROOT)).replaceAll("");
 				words.addAll(Arrays.asList(cleaned.split(" ")));
 			}
 		}
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json
@ -0,0 +1,21 @@
 [
  {
    "paramName":"s",
    "paramLongName":"sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "wp",
    "paramLongName": "workingPath",
    "paramDescription": "the path used to store temporary output files",
    "paramRequired": true
  },
  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml
@ -204,7 +204,31 @@
        <error to="Kill"/>
    </action>
-    <join name="wait" to="End"/>
+    <join name="wait" to="eosc_tag"/>
    <!-- EOSC tagging step: runs SparkEoscTag once the "wait" join has completed.
         sourcePath is set to ${outputPath}, the location SparkEoscTag reads and then overwrites in place;
         ${workingDir}/eoscTag is where it stages the tagged records before copying them back. -->
    <action name="eosc_tag">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>EOSC_tagging</name>
            <class>eu.dnetlib.dhp.bulktag.SparkEoscTag</class>
            <jar>dhp-enrichment-${projectVersion}.jar</jar>
            <spark-opts>
                --num-executors=${sparkExecutorNumber}
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${outputPath}</arg>
            <arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
@ -0,0 +1,547 @@
 package eu.dnetlib.dhp.bulktag;
 import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.oaf.*;
 /**
  * Tests for {@link SparkEoscTag}: each test stages one fixture scenario under the working directory, runs the
  * job on it and verifies the subjects found in the overwritten input folders.
  */
 public class EOSCTagJobTest {
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);
 	private static final String JUPYTER_TAG = "EOSC::Jupyter Notebook";
 	private static final String GALAXY_TAG = "EOSC::Galaxy Workflow";
 	private static final String TWITTER_TAG = "EOSC::Twitter Data";
 	private static SparkSession spark;
 	private static Path workingDir;
 	@BeforeAll
 	public static void beforeAll() throws IOException {
 		workingDir = Files.createTempDirectory(EOSCTagJobTest.class.getSimpleName());
 		log.info("using work dir {}", workingDir);
 		SparkConf conf = new SparkConf();
 		conf.setAppName(EOSCTagJobTest.class.getSimpleName());
 		conf.setMaster("local[*]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.toString());
 		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
 		spark = SparkSession
 			.builder()
 			.appName(EOSCTagJobTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}
 	@AfterAll
 	public static void afterAll() throws IOException {
 		// Stop Spark before removing the directory it works in; the directory is removed even if stop() fails.
 		try {
 			spark.stop();
 		} finally {
 			FileUtils.deleteDirectory(workingDir.toFile());
 		}
 	}
 	@Test
 	void jupyterUpdatesTest() throws Exception {
 		runEoscTagOn("jupyter");
 		final List<Software> software = readOutput("software", Software.class);
 		Assertions.assertEquals(10, software.size());
 		Assertions.assertEquals(4, countTagged(software, JUPYTER_TAG));
 		assertSubjects(software, "50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4", 2, JUPYTER_TAG, true);
 		assertSubjects(software, "50|od______1582::501b25d420f808c8eddcd9b16e917f11", 5, JUPYTER_TAG, false);
 		assertSubjects(software, "50|od______1582::581621232a561b7e8b4952b18b8b0e56", 9, JUPYTER_TAG, true);
 		assertSubjects(software, "50|od______1582::5aec1186054301b66c0c5dc35972a589", 5, JUPYTER_TAG, false);
 		assertSubjects(software, "50|od______1582::639909adfad9d708308f2aedb733e4a0", 9, JUPYTER_TAG, true);
 		final Software detailed = byId(software, "50|od______1582::6e7a9b21a2feef45673890432af34244");
 		Assertions.assertEquals(8, detailed.getSubject().size());
 		Assertions.assertTrue(hasSubject(detailed, JUPYTER_TAG));
 		Assertions.assertTrue(hasSubject(detailed, "jupyter"));
 		Assertions.assertTrue(hasSubject(detailed, "Modeling and Simulation"));
 		Assertions.assertTrue(hasSubject(detailed, "structure granulaire"));
 		Assertions.assertTrue(hasSubject(detailed, "algorithme"));
 		Assertions.assertTrue(hasSubject(detailed, "simulation numérique"));
 		Assertions.assertTrue(hasSubject(detailed, "flux de gaz"));
 		Assertions.assertTrue(hasSubject(detailed, "flux de liquide"));
 		final List<Dataset> datasets = readOutput("dataset", Dataset.class);
 		Assertions.assertEquals(10, datasets.size());
 		Assertions.assertEquals(0, countTagged(datasets, JUPYTER_TAG));
 		final List<OtherResearchProduct> orps = readOutput("otherresearchproduct", OtherResearchProduct.class);
 		Assertions.assertEquals(10, orps.size());
 		Assertions.assertEquals(0, countTagged(orps, JUPYTER_TAG));
 	}
 	@Test
 	void galaxyUpdatesTest() throws Exception {
 		runEoscTagOn("galaxy");
 		final List<Software> software = readOutput("software", Software.class);
 		Assertions.assertEquals(11, software.size());
 		Assertions.assertEquals(2, countTagged(software, GALAXY_TAG));
 		assertSubjects(software, "50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4", 2, GALAXY_TAG, true);
 		assertSubjects(software, "50|od______1582::501b25d420f808c8eddcd9b16e917f11", 6, GALAXY_TAG, true);
 		assertSubjects(software, "50|od______1582::581621232a561b7e8b4952b18b8b0e56", 8, GALAXY_TAG, false);
 		final List<OtherResearchProduct> orps = readOutput("otherresearchproduct", OtherResearchProduct.class);
 		Assertions.assertEquals(10, orps.size());
 		Assertions.assertEquals(2, countTagged(orps, GALAXY_TAG));
 		assertSubjects(orps, "50|od______2017::0750a4d0782265873d669520f5e33c07", 3, GALAXY_TAG, true);
 		assertSubjects(orps, "50|od______2017::1bd97baef19dbd2db3203b112bb83bc5", 2, GALAXY_TAG, false);
 		assertSubjects(orps, "50|od______2017::1e400f1747487fd15998735c41a55c72", 3, GALAXY_TAG, true);
 	}
 	@Test
 	void twitterUpdatesTest() throws Exception {
 		runEoscTagOn("twitter");
 		final List<Software> software = readOutput("software", Software.class);
 		Assertions.assertEquals(10, software.size());
 		Assertions.assertEquals(0, countTagged(software, TWITTER_TAG));
 		final List<OtherResearchProduct> orps = readOutput("otherresearchproduct", OtherResearchProduct.class);
 		Assertions.assertEquals(10, orps.size());
 		Assertions.assertEquals(3, countTagged(orps, TWITTER_TAG));
 		final List<Dataset> datasets = readOutput("dataset", Dataset.class);
 		Assertions.assertEquals(11, datasets.size());
 		Assertions.assertEquals(3, countTagged(datasets, TWITTER_TAG));
 	}
 	/** Stages the three fixture folders of {@code scenario} as job input and runs {@link SparkEoscTag} on them. */
 	private static void runEoscTagOn(String scenario) throws Exception {
 		stageInput(scenario, "software", Software.class);
 		stageInput(scenario, "dataset", Dataset.class);
 		stageInput(scenario, "otherresearchproduct", OtherResearchProduct.class);
 		SparkEoscTag
 			.main(
 				new String[] {
 					"-isSparkSessionManaged", Boolean.FALSE.toString(),
 					"-sourcePath",
 					workingDir.toString() + "/input",
 					"-workingPath", workingDir.toString() + "/working"
 				});
 	}
 	/** Copies one fixture folder to {@code <workingDir>/input/<entity>}, overwriting what a previous test left. */
 	private static <T extends Result> void stageInput(String scenario, String entity, Class<T> clazz) {
 		final String fixture = EOSCTagJobTest.class
 			.getResource("/eu/dnetlib/dhp/eosctag/" + scenario + "/" + entity)
 			.getPath();
 		spark
 			.read()
 			.textFile(fixture)
 			.map(
 				(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
 				Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(workingDir.toString() + "/input/" + entity);
 	}
 	/** Reads back, once, all the records the job left under {@code <workingDir>/input/<entity>}. */
 	private static <T extends Result> List<T> readOutput(String entity, Class<T> clazz) {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 		return sc
 			.textFile(workingDir.toString() + "/input/" + entity)
 			.map(item -> OBJECT_MAPPER.readValue(item, clazz))
 			.collect();
 	}
 	/** Tells whether {@code result} has a subject whose value equals {@code value}. */
 	private static boolean hasSubject(Result result, String value) {
 		return result.getSubject().stream().anyMatch(sbj -> value.equals(sbj.getValue()));
 	}
 	/** Counts the results having a subject whose value equals {@code tag}. */
 	private static long countTagged(List<? extends Result> results, String tag) {
 		return results.stream().filter(r -> hasSubject(r, tag)).count();
 	}
 	/** Returns the first result with the given id, failing the test when there is none. */
 	private static <T extends Result> T byId(List<T> results, String id) {
 		return results
 			.stream()
 			.filter(r -> r.getId().equals(id))
 			.findFirst()
 			.orElseThrow(() -> new AssertionError("no result with id " + id));
 	}
 	/** Asserts the number of subjects of the result with the given id and whether {@code tag} is among them. */
 	private static void assertSubjects(List<? extends Result> results, String id, int expectedSize, String tag,
 		boolean expectedTagged) {
 		final Result result = byId(results, id);
 		Assertions.assertEquals(expectedSize, result.getSubject().size());
 		Assertions.assertEquals(expectedTagged, hasSubject(result, tag));
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/galaxy/dataset/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/galaxy/dataset/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct/otherresearchproduct_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct/otherresearchproduct_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/galaxy/software/software_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/galaxy/software/software_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/dataset/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/dataset/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct/otherresearchproduct_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct/otherresearchproduct_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/dataset/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/dataset/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct/otherresearchproduct_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct/otherresearchproduct_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/software/software_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/twitter/software/software_10.json