Merge branch 'beta' into graph_cleaning

2022-12-02 14:49:00 +01:00 · 2022-12-02 14:49:00 +01:00 · 8248da40d9
parent 8e3edba318 ddf065756f
commit 8248da40d9
28 changed files with 1424 additions and 181 deletions
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -1,10 +1,12 @@

 package eu.dnetlib.dhp.oa.dedup;

-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
+import java.lang.reflect.InvocationTargetException;
+import java.util.*;
+import java.util.stream.Collectors;

+import org.apache.commons.beanutils.BeanUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
@ -15,6 +17,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Lists;

+import eu.dnetlib.dhp.oa.dedup.model.Identifier;
 import eu.dnetlib.dhp.oa.merge.AuthorMerger;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
@ -74,33 +77,42 @@ public class DedupRecordFactory {

 	public static <T extends OafEntity> T entityMerger(
 		String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
-		throws IllegalAccessException, InstantiationException {
+		throws IllegalAccessException, InstantiationException, InvocationTargetException {

-		T entity = clazz.newInstance();
-		entity.setDataInfo(dataInfo);
+		final Comparator<Identifier<T>> idComparator = new IdentifierComparator<>();
+
+		final LinkedList<T> entityList = Lists
+			.newArrayList(entities)
+			.stream()
+			.map(t -> Identifier.newInstance(t._2()))
+			.sorted(idComparator)
+			.map(Identifier::getEntity)
+			.collect(Collectors.toCollection(LinkedList::new));
+
+		final T entity = clazz.newInstance();
+		final T first = entityList.removeFirst();
+
+		BeanUtils.copyProperties(entity, first);

 		final Collection<String> dates = Lists.newArrayList();
 		final List<List<Author>> authors = Lists.newArrayList();

-		entities
-			.forEachRemaining(
-				t -> {
-					T duplicate = t._2();
-
+		// Fold every remaining duplicate into the record seeded from the first (best ranked) entity.
+		entityList
+			.forEach(
+				duplicate -> {
 					entity.mergeFrom(duplicate);
 					if (ModelSupport.isSubClass(duplicate, Result.class)) {
 						Result r1 = (Result) duplicate;
-						if (r1.getAuthor() != null && !r1.getAuthor().isEmpty())
+						// dateofacceptance may be missing (see the null check below): test it before
+						// dereferencing, otherwise a record without a date aborts the whole merge with an NPE
+						final boolean hasDate = r1.getDateofacceptance() != null
+							&& StringUtils.isNotBlank(r1.getDateofacceptance().getValue());
+						if (r1.getAuthor() != null && hasDate)
 							authors.add(r1.getAuthor());
 						if (r1.getDateofacceptance() != null)
 							dates.add(r1.getDateofacceptance().getValue());
 					}
-
 				});

 		// set authors and date
 		if (ModelSupport.isSubClass(entity, Result.class)) {
-			((Result) entity).setDateofacceptance(DatePicker.pick(dates));
+			// ((Result) entity).setDateofacceptance(DatePicker.pick(dates));
 			((Result) entity).setAuthor(AuthorMerger.merge(authors));
 		}

--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java
@ -18,6 +18,10 @@ public class IdGenerator implements Serializable {
 		if (pids == null || pids.isEmpty())
 			return defaultID;

+		return generateId(pids);
+	}
+
+	private static <T extends OafEntity> String generateId(List<Identifier<T>> pids) {
 		Identifier<T> bp = pids
 			.stream()
 			.min(Identifier::compareTo)
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdentifierComparator.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdentifierComparator.java
@ -0,0 +1,81 @@
+
+package eu.dnetlib.dhp.oa.dedup;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.oa.dedup.model.Identifier;
+import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
+import eu.dnetlib.dhp.schema.oaf.utils.PidType;
+
+/**
+ * Orders {@link Identifier}s so that the preferred one sorts first.
+ *
+ * Priority in comparisons: 1) pid type, 2) collectedfrom, depending on the entity type (Crossref first for
+ * publications, Datacite first for datasets), 3) date in its natural order, 4) alphabetical order of the
+ * original id.
+ */
+public class IdentifierComparator<T extends OafEntity> implements Comparator<Identifier<T>> {
+
+	/**
+	 * Static shortcut that compares two identifiers with a new comparator instance.
+	 *
+	 * @param left the first identifier
+	 * @param right the second identifier
+	 * @return a negative value, zero or a positive value, as for {@link Comparator#compare}
+	 */
+	public static <E extends OafEntity> int compareIdentifiers(Identifier<E> left, Identifier<E> right) {
+		return new IdentifierComparator<E>().compare(left, right);
+	}
+
+	@Override
+	public int compare(Identifier<T> left, Identifier<T> right) {
+		final PidType leftType = left.getPidType();
+		final PidType rightType = right.getPidType();
+
+		if (leftType.compareTo(rightType) != 0) {
+			// different pid types: the pid type alone decides
+			return new PidComparator<>(left.getEntity()).compare(toSP(leftType), toSP(rightType));
+		}
+
+		// same pid type: the provenance keys are only needed from here on
+		final Set<String> lKeys = collectedFromKeys(left);
+		final Set<String> rKeys = collectedFromKeys(right);
+
+		int byDatasource = 0;
+		if (left.getEntityType() == EntityType.publication) {
+			byDatasource = preferDatasource(lKeys, rKeys, ModelConstants.CROSSREF_ID);
+		} else if (left.getEntityType() == EntityType.dataset) {
+			byDatasource = preferDatasource(lKeys, rKeys, ModelConstants.DATACITE_ID);
+		}
+		if (byDatasource != 0) {
+			return byDatasource;
+		}
+
+		// we need to take the elder date
+		final int byDate = left.getDate().compareTo(right.getDate());
+		if (byDate != 0) {
+			return byDate;
+		}
+
+		// same date: we need to take the alphabetically lower id
+		return left.getOriginalID().compareTo(right.getOriginalID());
+	}
+
+	/**
+	 * @param collectedFrom the datasource keys of a record
+	 * @param dsId the datasource id to look for
+	 * @return true when the given keys contain the datasource id
+	 */
+	public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
+		return collectedFrom.contains(dsId);
+	}
+
+	/** Returns -1 when only the left keys contain dsId, 1 when only the right keys do, 0 otherwise. */
+	private int preferDatasource(Set<String> lKeys, Set<String> rKeys, String dsId) {
+		final boolean leftMatches = isFromDatasourceID(lKeys, dsId);
+		final boolean rightMatches = isFromDatasourceID(rKeys, dsId);
+		if (leftMatches == rightMatches) {
+			return 0;
+		}
+		return leftMatches ? -1 : 1;
+	}
+
+	/** Collects the collectedfrom keys of an identifier; a missing list yields an empty set. */
+	private Set<String> collectedFromKeys(Identifier<T> identifier) {
+		final Optional<List<KeyValue>> collectedFrom = Optional.ofNullable(identifier.getCollectedFrom());
+		return collectedFrom
+			.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
+			.orElse(Sets.newHashSet());
+	}
+
+	/** Wraps a pid type into the StructuredProperty shape that PidComparator compares. */
+	private StructuredProperty toSP(PidType pidType) {
+		return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
+	}
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java
@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
 import com.google.common.collect.Sets;

 import eu.dnetlib.dhp.oa.dedup.DatePicker;
+import eu.dnetlib.dhp.oa.dedup.IdentifierComparator;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -83,60 +84,12 @@ public class Identifier<T extends OafEntity> implements Serializable, Comparable
 		return entity.getId();
 	}

+	/**
+	 * Derives the pid type from the entity id: the text after the first {@code |} and before the following
+	 * {@code _} is handed to {@link PidType#tryValueOf}.
+	 */
-	private PidType getPidType() {
+	public PidType getPidType() {
 		return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
 	}

 	@Override
 	public int compareTo(Identifier<T> i) {
-		// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
-		// alphabetical order of the originalID
-
-		Set<String> lKeys = Optional
-			.ofNullable(getCollectedFrom())
-			.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
-			.orElse(Sets.newHashSet());
-
-		final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
-		Set<String> rKeys = cf
-			.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
-			.orElse(Sets.newHashSet());
-
-		if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
-			if (getEntityType() == EntityType.publication) {
-				if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
-					&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
-					return -1;
-				if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
-					&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
-					return 1;
-			}
-			if (getEntityType() == EntityType.dataset) {
-				if (isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID)
-					&& !isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID))
-					return -1;
-				if (isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID)
-					&& !isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID))
-					return 1;
-			}
-
-			if (this.getDate().compareTo(i.getDate()) == 0) {// same date
-				// we need to take the alphabetically lower id
-				return this.getOriginalID().compareTo(i.getOriginalID());
-			} else
-				// we need to take the elder date
-				return this.getDate().compareTo(i.getDate());
-		} else {
-			return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
-		}
-
-	}
-
-	private StructuredProperty toSP(PidType pidType) {
-		return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
-	}
-
-	public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
-		return collectedFrom.contains(dsId);
+		return IdentifierComparator.compareIdentifiers(this, i);
 	}
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@ -7,6 +7,7 @@ import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
 import java.nio.file.Paths;
 import java.util.*;
 import java.util.stream.Collectors;
@ -54,7 +55,7 @@ class EntityMergerTest implements Serializable {
 	}

 	@Test
-	void softwareMergerTest() throws InstantiationException, IllegalAccessException {
+	void softwareMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {

 		List<Tuple2<String, Software>> softwares = readSample(
 			testEntityBasePath + "/software_merge.json", Software.class);
@ -69,7 +70,7 @@ class EntityMergerTest implements Serializable {
 	}

 	@Test
-	void publicationMergerTest() throws InstantiationException, IllegalAccessException {
+	void publicationMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {

 		Publication pub_merged = DedupRecordFactory
 			.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
@ -134,7 +135,7 @@ class EntityMergerTest implements Serializable {
 	}

 	@Test
-	void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
+	void publicationMergerTest2() throws InstantiationException, IllegalAccessException, InvocationTargetException {

 		Publication pub_merged = DedupRecordFactory
 			.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
@ -146,7 +147,7 @@ class EntityMergerTest implements Serializable {
 	}

 	@Test
-	void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
+	void publicationMergerTest3() throws InstantiationException, IllegalAccessException, InvocationTargetException {

 		Publication pub_merged = DedupRecordFactory
 			.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
@ -156,7 +157,8 @@ class EntityMergerTest implements Serializable {
 	}

 	@Test
-	void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
+	void publicationMergerTest4()
+		throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {

 		Publication pub_merged = DedupRecordFactory
 			.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
@ -166,7 +168,8 @@ class EntityMergerTest implements Serializable {
 	}

 	@Test
-	void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
+	void publicationMergerTest5()
+		throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {

 		System.out
 			.println(
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java
@ -4,8 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
 import static java.nio.file.Files.createTempDirectory;

 import static org.apache.spark.sql.functions.count;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.Mockito.lenient;

 import java.io.File;
@ -14,7 +13,11 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.net.URISyntaxException;
 import java.nio.file.Paths;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;

 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
@ -35,10 +38,13 @@ import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;

+import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Sets;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.util.MapDocumentUtil;
@ -105,57 +111,27 @@ public class SparkDedupTest implements Serializable {

 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
-			.thenReturn(
-				IOUtils
-					.toString(
-						SparkDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml")));
+			.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"));

 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization")))
-			.thenReturn(
-				IOUtils
-					.toString(
-						SparkDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
+			.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"));

 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
-			.thenReturn(
-				IOUtils
-					.toString(
-						SparkDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
+			.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"));

 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
-			.thenReturn(
-				IOUtils
-					.toString(
-						SparkDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
+			.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"));

 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
-			.thenReturn(
-				IOUtils
-					.toString(
-						SparkDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
+			.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"));

 		lenient()
 			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
-			.thenReturn(
-				IOUtils
-					.toString(
-						SparkDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
+			.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"));
 	}

 	@Test
@ -163,11 +139,7 @@ public class SparkDedupTest implements Serializable {
 	void createSimRelsTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkCreateSimRels.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"));

 		parser
 			.parseArgument(
@ -207,7 +179,7 @@ public class SparkDedupTest implements Serializable {
 			.count();

 		assertEquals(3076, orgs_simrel);
-		assertEquals(7040, pubs_simrel);
+		assertEquals(7046, pubs_simrel);
 		assertEquals(336, sw_simrel);
 		assertEquals(442, ds_simrel);
 		assertEquals(6784, orp_simrel);
@ -223,11 +195,7 @@ public class SparkDedupTest implements Serializable {
 	void whitelistSimRelsTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkWhitelistSimRels.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json"));

 		parser
 			.parseArgument(
@ -264,7 +232,7 @@ public class SparkDedupTest implements Serializable {

 		// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
 		assertEquals(3076, orgs_simrel);
-		assertEquals(7040, pubs_simrel);
+		assertEquals(7046, pubs_simrel);
 		assertEquals(442, ds_simrel);
 		assertEquals(6784, orp_simrel);
 //		System.out.println("orgs_simrel = " + orgs_simrel);
@ -306,11 +274,7 @@ public class SparkDedupTest implements Serializable {
 	void cutMergeRelsTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkCreateMergeRels.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));

 		parser
 			.parseArgument(
@ -402,11 +366,7 @@ public class SparkDedupTest implements Serializable {
 	void createMergeRelsTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkCreateMergeRels.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));

 		parser
 			.parseArgument(
@ -427,10 +387,10 @@ public class SparkDedupTest implements Serializable {
 			.read()
 			.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
 			.count();
-		long pubs_mergerel = spark
+		final Dataset<Relation> pubs = spark
 			.read()
 			.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
-			.count();
+			.as(Encoders.bean(Relation.class));
 		long sw_mergerel = spark
 			.read()
 			.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
@ -445,8 +405,35 @@ public class SparkDedupTest implements Serializable {
 			.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
 			.count();

+		final List<Relation> merges = pubs
+			.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
+			.collectAsList();
+		assertEquals(3, merges.size());
+		Set<String> dups = Sets
+			.newHashSet(
+				"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
+				"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
+				"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
+		merges.forEach(r -> {
+			assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
+			assertEquals(ModelConstants.DEDUP, r.getSubRelType());
+			assertEquals(ModelConstants.MERGES, r.getRelClass());
+			assertTrue(dups.contains(r.getTarget()));
+		});
+
+		final List<Relation> mergedIn = pubs
+			.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
+			.collectAsList();
+		assertEquals(3, mergedIn.size());
+		mergedIn.forEach(r -> {
+			assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
+			assertEquals(ModelConstants.DEDUP, r.getSubRelType());
+			assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
+			assertTrue(dups.contains(r.getSource()));
+		});
+
 		assertEquals(1268, orgs_mergerel);
-		assertEquals(1444, pubs_mergerel);
+		assertEquals(1450, pubs.count());
 		assertEquals(286, sw_mergerel);
 		assertEquals(472, ds_mergerel);
 		assertEquals(738, orp_mergerel);
@ -463,11 +450,7 @@ public class SparkDedupTest implements Serializable {
 	void createDedupRecordTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkCreateDedupRecord.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"));
 		parser
 			.parseArgument(
 				new String[] {
@ -483,12 +466,18 @@ public class SparkDedupTest implements Serializable {

 		new SparkCreateDedupRecord(parser, spark).run(isLookUpService);

+		final ObjectMapper mapper = new ObjectMapper()
+			.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+
+		final Dataset<Publication> pubs = spark
+			.read()
+			.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
+			.map(
+				(MapFunction<String, Publication>) value -> mapper.readValue(value, Publication.class),
+				Encoders.bean(Publication.class));
 		long orgs_deduprecord = jsc
 			.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord")
 			.count();
-		long pubs_deduprecord = jsc
-			.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
-			.count();
 		long sw_deduprecord = jsc
 			.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
 			.count();
@ -499,11 +488,13 @@ public class SparkDedupTest implements Serializable {
 			.count();

 		assertEquals(86, orgs_deduprecord);
-		assertEquals(67, pubs_deduprecord);
+		assertEquals(68, pubs.count());
 		assertEquals(49, sw_deduprecord);
 		assertEquals(97, ds_deduprecord);
 		assertEquals(92, orp_deduprecord);

+		verifyRoot_1(mapper, pubs);
+
 //		System.out.println("orgs_deduprecord = " + orgs_deduprecord);
 //		System.out.println("pubs_deduprecord = " + pubs_deduprecord);
 //		System.out.println("sw_deduprecord = " + sw_deduprecord);
@ -511,16 +502,63 @@ public class SparkDedupTest implements Serializable {
 //		System.out.println("orp_deduprecord = " + orp_deduprecord);
 	}

+	/**
+	 * Checks the dedup root {@code 50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032} against the input
+	 * publication {@code 50|doi_________::d5021b53204e4fdeab6ff5d5bc468032}: journal, publisher, pids and
+	 * the instance collected from Crossref.
+	 *
+	 * @param mapper used to parse the input publications from their JSON lines
+	 * @param pubs the dedup records produced for the publications
+	 */
+	private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
+		final Publication dedupRoot = pubs
+			.filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
+			.first();
+		assertNotNull(dedupRoot);
+
+		// load the raw input record the root is expected to be built from
+		final MapFunction<String, Publication> parsePublication = json -> mapper.readValue(json, Publication.class);
+		final List<Publication> crossrefMatches = spark
+			.read()
+			.textFile(DedupUtility.createEntityPath(testGraphBasePath, "publication"))
+			.map(parsePublication, Encoders.bean(Publication.class))
+			.filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
+			.collectAsList();
+		final Publication crossrefDuplicate = crossrefMatches.get(0);
+
+		// journal and publisher must be the ones of the Crossref record
+		assertEquals(crossrefDuplicate.getJournal().getName(), dedupRoot.getJournal().getName());
+		assertEquals(crossrefDuplicate.getJournal().getIssnPrinted(), dedupRoot.getJournal().getIssnPrinted());
+		assertEquals(crossrefDuplicate.getPublisher().getValue(), dedupRoot.getPublisher().getValue());
+
+		// the root shares at least one pid with the Crossref record, and carries the expected DOI
+		final Set<String> rootPidValues = new HashSet<>();
+		dedupRoot.getPid().forEach(pid -> rootPidValues.add(pid.getValue()));
+		final Set<String> duplicatePidValues = new HashSet<>();
+		crossrefDuplicate.getPid().forEach(pid -> duplicatePidValues.add(pid.getValue()));
+
+		assertTrue(rootPidValues.stream().anyMatch(duplicatePidValues::contains));
+		assertTrue(rootPidValues.contains("10.1109/jstqe.2022.3205716"));
+
+		// the instance collected from Crossref keeps its access right, hosting datasource and type
+		final Optional<Instance> crossrefInstance = dedupRoot
+			.getInstance()
+			.stream()
+			.filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
+			.findFirst();
+		assertTrue(crossrefInstance.isPresent());
+
+		final Instance instance = crossrefInstance.get();
+		assertEquals("OPEN", instance.getAccessright().getClassid());
+		assertEquals("Open Access", instance.getAccessright().getClassname());
+		assertEquals(OpenAccessRoute.hybrid, instance.getAccessright().getOpenAccessRoute());
+		assertEquals(
+			"IEEE Journal of Selected Topics in Quantum Electronics", instance.getHostedby().getValue());
+		assertEquals("0001", instance.getInstancetype().getClassid());
+		assertEquals("Article", instance.getInstancetype().getClassname());
+	}
+
 	@Test
 	@Order(6)
 	void updateEntityTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkUpdateEntity.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"));
 		parser
 			.parseArgument(
 				new String[] {
@ -587,7 +625,7 @@ public class SparkDedupTest implements Serializable {
 			.distinct()
 			.count();

-		assertEquals(898, publications);
+		assertEquals(902, publications);
 		assertEquals(839, organizations);
 		assertEquals(100, projects);
 		assertEquals(100, datasource);
@ -640,11 +678,7 @@ public class SparkDedupTest implements Serializable {
 	void propagateRelationTest() throws Exception {

 		ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkPropagateRelation.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
+			classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"));
 		parser
 			.parseArgument(
 				new String[] {
@ -714,4 +748,12 @@ public class SparkDedupTest implements Serializable {
 	public boolean isDeletedByInference(String s) {
 		return s.contains("\"deletedbyinference\":true");
 	}
+
+	/**
+	 * Reads a classpath resource, resolved against {@code SparkDedupTest}, into a string.
+	 *
+	 * @param path resource path handed to {@link Class#getResourceAsStream(String)}
+	 * @return the resource content decoded as UTF-8
+	 * @throws IOException when the resource does not exist or cannot be read
+	 */
+	private static String classPathResourceAsString(String path) throws IOException {
+		// fully qualified names keep this helper independent from the import section of the file
+		try (java.io.InputStream resource = SparkDedupTest.class.getResourceAsStream(path)) {
+			if (resource == null) {
+				// name the missing resource instead of failing later with an anonymous NPE
+				throw new java.io.FileNotFoundException("classpath resource not found: " + path);
+			}
+			// explicit charset: the result must not depend on the platform default encoding
+			return IOUtils.toString(resource, java.nio.charset.StandardCharsets.UTF_8);
+		}
+	}
+
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java
@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
 			.count();

-		assertEquals(288, orgs_simrel);
+		assertEquals(290, orgs_simrel);
 	}

 	@Test
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
 			.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
 			.count();

-		assertEquals(324, orgs_simrel);
+		assertEquals(326, orgs_simrel);
 	}

 	@Test
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java
@ -0,0 +1,403 @@
+
+package eu.dnetlib.dhp.oa.dedup;
+
+import static java.nio.file.Files.createTempDirectory;
+
+import static org.apache.spark.sql.functions.count;
+import static org.junit.jupiter.api.Assertions.*;
+import static org.mockito.Mockito.lenient;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+@ExtendWith(MockitoExtension.class)
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+public class SparkPublicationRootsTest implements Serializable {
+
+	@Mock(serializable = true)
+	ISLookUpService isLookUpService;
+
+	private static SparkSession spark;
+	private static String workingPath;
+
+	private static String graphInputPath;
+	private static String graphOutputPath;
+	private static final String testActionSetId = "test-orchestrator";
+
+	private static Path testBaseTmpPath;
+
+	private static final ObjectMapper MAPPER = new ObjectMapper()
+		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+
+	/**
+	 * Prepares the shared fixtures: copies the {@code /eu/dnetlib/dhp/dedup/root} classpath directory into a fresh
+	 * temporary folder, derives the working/input/output paths from it and obtains a local Spark session.
+	 */
+	@BeforeAll
+	public static void init() throws IOException, URISyntaxException {
+		final String testName = SparkPublicationRootsTest.class.getSimpleName();
+
+		testBaseTmpPath = createTempDirectory(testName + "-");
+		final Path inputRoot = testBaseTmpPath.resolve("input");
+
+		// stage the bundled test data under <tmp>/input
+		final File bundledRoot = Paths
+			.get(SparkPublicationRootsTest.class.getResource("/eu/dnetlib/dhp/dedup/root").toURI())
+			.toFile();
+		FileUtils.copyDirectory(bundledRoot, inputRoot.toFile());
+
+		workingPath = testBaseTmpPath.resolve("workingPath").toString();
+		graphInputPath = inputRoot.resolve("entities").toString();
+		graphOutputPath = testBaseTmpPath.resolve("output").toString();
+
+		for (String staleDir : new String[] { workingPath, graphOutputPath }) {
+			FileUtils.deleteDirectory(new File(staleDir));
+		}
+
+		final SparkConf sparkConf = new SparkConf().set("spark.sql.shuffle.partitions", "10");
+		spark = SparkSession
+			.builder()
+			.appName(testName)
+			.master("local[*]")
+			.config(sparkConf)
+			.getOrCreate();
+	}
+
+	/**
+	 * Stubs the lookup service mock leniently: queries containing the action set id answer with
+	 * {@code mock_orchestrator_publication.xml}, queries containing "publication" answer with
+	 * {@code pub.curr.conf.json}.
+	 */
+	@BeforeEach
+	public void setUp() throws IOException, ISLookUpException {
+		final String orchestratorProfile = classPathResourceAsString(
+			"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml");
+		final String publicationConf = classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");
+
+		lenient()
+			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
+			.thenReturn(orchestratorProfile);
+		lenient()
+			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
+			.thenReturn(publicationConf);
+	}
+
+	/**
+	 * Removes the temporary test folder and closes the Spark session.
+	 *
+	 * @throws IOException if the temporary folder cannot be deleted
+	 */
+	@AfterAll
+	public static void tearDown() throws IOException {
+		try {
+			// null when init() failed before the folder was created
+			if (testBaseTmpPath != null) {
+				FileUtils.deleteDirectory(testBaseTmpPath.toFile());
+			}
+		} finally {
+			// close the session even when the cleanup above throws
+			if (spark != null) {
+				spark.close();
+			}
+		}
+	}
+
+	@Test
+	@Order(1)
+	void createSimRelsTest() throws Exception {
+		new SparkCreateSimRels(args(
+			"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json",
+			new String[] {
+				"--graphBasePath", graphInputPath,
+				"--actionSetId", testActionSetId,
+				"--isLookUpUrl", "lookupurl",
+				"--workingPath", workingPath,
+				"--numPartitions", "5"
+			}), spark)
+				.run(isLookUpService);
+
+		long pubs_simrel = spark
+			.read()
+			.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
+			.count();
+
+		assertEquals(74, pubs_simrel);
+	}
+
+	@Test
+	@Order(2)
+	void cutMergeRelsTest() throws Exception {
+		new SparkCreateMergeRels(args(
+			"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json",
+			new String[] {
+				"--graphBasePath", graphInputPath,
+				"--actionSetId", testActionSetId,
+				"--isLookUpUrl", "lookupurl",
+				"--workingPath", workingPath,
+				"--cutConnectedComponent", "3"
+			}), spark)
+				.run(isLookUpService);
+
+		long pubs_mergerel = spark
+			.read()
+			.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
+			.as(Encoders.bean(Relation.class))
+			.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
+			.groupBy("source")
+			.agg(count("target").alias("cnt"))
+			.select("source", "cnt")
+			.where("cnt > 3")
+			.count();
+
+		assertEquals(0, pubs_mergerel);
+
+		FileUtils.deleteDirectory(new File(workingPath + "/" + testActionSetId + "/publication_mergerel"));
+	}
+
+	@Test
+	@Order(3)
+	void createMergeRelsTest() throws Exception {
+		new SparkCreateMergeRels(args(
+			"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json",
+			new String[] {
+				"--graphBasePath", graphInputPath,
+				"--actionSetId", testActionSetId,
+				"--isLookUpUrl", "lookupurl",
+				"--workingPath", workingPath
+			}), spark)
+				.run(isLookUpService);
+
+		final Dataset<Relation> merges = spark
+			.read()
+			.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
+			.as(Encoders.bean(Relation.class));
+
+		final List<Relation> mergeList = merges
+			.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
+			.collectAsList();
+		assertEquals(3, mergeList.size());
+		Set<String> dups = Sets
+			.newHashSet(
+				"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
+				"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
+				"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
+		mergeList.forEach(r -> {
+			assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
+			assertEquals(ModelConstants.DEDUP, r.getSubRelType());
+			assertEquals(ModelConstants.MERGES, r.getRelClass());
+			assertTrue(dups.contains(r.getTarget()));
+		});
+
+		final List<Relation> mergedIn = merges
+			.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
+			.collectAsList();
+		assertEquals(3, mergedIn.size());
+		mergedIn.forEach(r -> {
+			assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
+			assertEquals(ModelConstants.DEDUP, r.getSubRelType());
+			assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
+			assertTrue(dups.contains(r.getSource()));
+		});
+
+		assertEquals(32, merges.count());
+	}
+
+	/**
+	 * Creates the dedup records, expects three roots and delegates the per-root checks to the
+	 * {@code verifyRoot_case_*} helpers.
+	 */
+	@Test
+	@Order(4)
+	void createDedupRecordTest() throws Exception {
+		final String[] cliArgs = {
+			"--graphBasePath", graphInputPath,
+			"--actionSetId", testActionSetId,
+			"--isLookUpUrl", "lookupurl",
+			"--workingPath", workingPath
+		};
+		final ArgumentApplicationParser parser = args(
+			"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", cliArgs);
+		new SparkCreateDedupRecord(parser, spark).run(isLookUpService);
+
+		final String dedupRecordPath = workingPath + "/" + testActionSetId + "/publication_deduprecord";
+		final Dataset<Publication> roots = spark
+			.read()
+			.textFile(dedupRecordPath)
+			.map(asEntity(Publication.class), Encoders.bean(Publication.class));
+
+		assertEquals(3, roots.count());
+
+		final String inputPublicationPath = DedupUtility.createEntityPath(graphInputPath, "publication");
+		final Dataset<Publication> pubs = spark
+			.read()
+			.textFile(inputPublicationPath)
+			.map(asEntity(Publication.class), Encoders.bean(Publication.class));
+
+		verifyRoot_case_1(roots, pubs);
+		verifyRoot_case_2(roots, pubs);
+		verifyRoot_case_3(roots, pubs);
+	}
+
+	private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
+		Publication root = roots
+			.filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
+			.first();
+		assertNotNull(root);
+
+		Publication crossref_duplicate = pubs
+			.filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
+			.collectAsList()
+			.get(0);
+
+		assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
+		assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
+		assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
+
+		Set<String> rootPids = root
+			.getPid()
+			.stream()
+			.map(StructuredProperty::getValue)
+			.collect(Collectors.toCollection(HashSet::new));
+		Set<String> dupPids = crossref_duplicate
+			.getPid()
+			.stream()
+			.map(StructuredProperty::getValue)
+			.collect(Collectors.toCollection(HashSet::new));
+
+		assertFalse(Sets.intersection(rootPids, dupPids).isEmpty());
+		assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));
+
+		Optional<Instance> instance_cr = root
+			.getInstance()
+			.stream()
+			.filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
+			.findFirst();
+		assertTrue(instance_cr.isPresent());
+		assertEquals("OPEN", instance_cr.get().getAccessright().getClassid());
+		assertEquals("Open Access", instance_cr.get().getAccessright().getClassname());
+		assertEquals(OpenAccessRoute.hybrid, instance_cr.get().getAccessright().getOpenAccessRoute());
+		assertEquals(
+			"IEEE Journal of Selected Topics in Quantum Electronics", instance_cr.get().getHostedby().getValue());
+		assertEquals("0001", instance_cr.get().getInstancetype().getClassid());
+		assertEquals("Article", instance_cr.get().getInstancetype().getClassname());
+	}
+
+	private void verifyRoot_case_2(Dataset<Publication> roots, Dataset<Publication> pubs) {
+		Publication root = roots
+			.filter("id = '50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885'")
+			.first();
+		assertNotNull(root);
+
+		Publication crossref_duplicate = pubs
+			.filter("id = '50|doi_________::18aff3b55fb6876466a5d4bd82434885'")
+			.first();
+
+		// System.err.println(new ObjectMapper().writeValueAsString(root));
+
+		assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
+		assertEquals(crossref_duplicate.getJournal().getIssnOnline(), root.getJournal().getIssnOnline());
+		assertEquals(crossref_duplicate.getJournal().getVol(), root.getJournal().getVol());
+
+		assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
+
+		Set<String> dups_cf = pubs
+			.collectAsList()
+			.stream()
+			.flatMap(p -> p.getCollectedfrom().stream())
+			.map(KeyValue::getValue)
+			.collect(Collectors.toCollection(HashSet::new));
+
+		Set<String> root_cf = root
+			.getCollectedfrom()
+			.stream()
+			.map(KeyValue::getValue)
+			.collect(Collectors.toCollection(HashSet::new));
+
+		assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
+	}
+
+	private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
+		Publication root = roots
+			.filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
+			.first();
+		assertNotNull(root);
+
+		Publication pivot_duplicate = pubs
+			.filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
+			.first();
+
+		assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
+
+		Set<String> dups_cf = pubs
+			.collectAsList()
+			.stream()
+			.flatMap(p -> p.getCollectedfrom().stream())
+			.map(KeyValue::getValue)
+			.collect(Collectors.toCollection(HashSet::new));
+
+		Set<String> root_cf = root
+			.getCollectedfrom()
+			.stream()
+			.map(KeyValue::getValue)
+			.collect(Collectors.toCollection(HashSet::new));
+
+		assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
+	}
+
+	@Test
+	@Order(6)
+	void updateEntityTest() throws Exception {
+		new SparkUpdateEntity(args(
+			"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json",
+			new String[] {
+				"--graphBasePath", graphInputPath,
+				"--workingPath", workingPath,
+				"--dedupGraphPath", graphOutputPath
+			}), spark)
+				.run(isLookUpService);
+
+		long publications = spark.read().textFile(graphOutputPath + "/publication").count();
+
+		long mergedPubs = spark
+			.read()
+			.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
+			.as(Encoders.bean(Relation.class))
+			.where("relClass=='merges'")
+			.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
+			.distinct()
+			.count();
+
+		assertEquals(19, publications); // 16 originals + 3 roots
+
+		long deletedPubs = spark
+			.read()
+			.textFile(graphOutputPath + "/publication")
+			.map(asEntity(Publication.class), Encoders.bean(Publication.class))
+			.filter("datainfo.deletedbyinference == true")
+			.map((MapFunction<Publication, String>) OafEntity::getId, Encoders.STRING())
+			.distinct()
+			.count();
+
+		assertEquals(mergedPubs, deletedPubs);
+	}
+
+	private static String classPathResourceAsString(String path) throws IOException {
+		return IOUtils
+			.toString(
+				SparkPublicationRootsTest.class
+					.getResourceAsStream(path));
+	}
+
+	private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
+		return value -> MAPPER.readValue(value, clazz);
+	}
+
+	private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
+		ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
+		parser.parseArgument(args);
+		return parser;
+	}
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java
@ -0,0 +1,251 @@
+
+package eu.dnetlib.dhp.oa.dedup;
+
+import static java.nio.file.Files.createTempDirectory;
+
+import static org.apache.spark.sql.functions.count;
+import static org.junit.jupiter.api.Assertions.*;
+import static org.mockito.Mockito.lenient;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+@ExtendWith(MockitoExtension.class)
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+public class SparkPublicationRootsTest2 implements Serializable {
+
+	@Mock(serializable = true)
+	ISLookUpService isLookUpService;
+	private static SparkSession spark;
+
+	private static String workingPath;
+
+	private static String graphInputPath;
+
+	private static String graphOutputPath;
+
+	private static final String testActionSetId = "test-orchestrator";
+
+	private static Path testBaseTmpPath;
+
+	private static final ObjectMapper MAPPER = new ObjectMapper()
+		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+
+	/**
+	 * Prepares the shared fixtures: copies the {@code /eu/dnetlib/dhp/dedup/root} classpath directory into a fresh
+	 * temporary folder, adds the {@code alterations/publication/publication_1.gz} file to the copied publication
+	 * folder, derives the working/input/output paths and obtains a local Spark session.
+	 */
+	@BeforeAll
+	public static void init() throws IOException, URISyntaxException {
+		final Class<SparkPublicationRootsTest2> self = SparkPublicationRootsTest2.class;
+
+		testBaseTmpPath = createTempDirectory(self.getSimpleName() + "-");
+		final Path inputRoot = testBaseTmpPath.resolve("input");
+		final Path entitiesRoot = inputRoot.resolve("entities");
+
+		// base dataset
+		final File bundledRoot = Paths.get(self.getResource("/eu/dnetlib/dhp/dedup/root").toURI()).toFile();
+		FileUtils.copyDirectory(bundledRoot, inputRoot.toFile());
+
+		// extra publication file placed next to the base publications
+		final File alteration = Paths
+			.get(self.getResource("/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz").toURI())
+			.toFile();
+		FileUtils.copyFileToDirectory(alteration, entitiesRoot.resolve("publication").toFile());
+
+		workingPath = testBaseTmpPath.resolve("workingPath").toString();
+		graphInputPath = entitiesRoot.toString();
+		graphOutputPath = testBaseTmpPath.resolve("output").toString();
+
+		final SparkConf sparkConf = new SparkConf().set("spark.sql.shuffle.partitions", "10");
+		spark = SparkSession
+			.builder()
+			.appName(self.getSimpleName())
+			.master("local[*]")
+			.config(sparkConf)
+			.getOrCreate();
+	}
+
+	/**
+	 * Stubs the lookup service mock leniently: queries containing the action set id answer with
+	 * {@code mock_orchestrator_publication.xml}, queries containing "publication" answer with
+	 * {@code pub.curr.conf.json}.
+	 */
+	@BeforeEach
+	public void setUp() throws IOException, ISLookUpException {
+		final String orchestratorProfile = classPathResourceAsString(
+			"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml");
+		final String publicationConf = classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");
+
+		lenient()
+			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
+			.thenReturn(orchestratorProfile);
+		lenient()
+			.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
+			.thenReturn(publicationConf);
+	}
+
+	/**
+	 * Removes the temporary test folder and closes the Spark session obtained in {@code init()}.
+	 *
+	 * @throws IOException if the temporary folder cannot be deleted
+	 */
+	@AfterAll
+	public static void tearDown() throws IOException {
+		try {
+			// null when init() failed before the folder was created
+			if (testBaseTmpPath != null) {
+				FileUtils.deleteDirectory(testBaseTmpPath.toFile());
+			}
+		} finally {
+			// the session used to be left open; close it even when the cleanup above throws
+			if (spark != null) {
+				spark.close();
+			}
+		}
+	}
+
+	/**
+	 * Runs simrel creation, merge relation creation and dedup record creation on the altered dataset, then checks
+	 * the merge relations and the root {@code 50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d} against its
+	 * {@code 50|doi_________::} duplicate.
+	 */
+	@Test
+	@Order(7)
+	void dedupAlteredDatasetTest() throws Exception {
+
+		// step 1: similarity relations
+		final String[] simRelArgs = {
+			"--graphBasePath", graphInputPath,
+			"--actionSetId", testActionSetId,
+			"--isLookUpUrl", "lookupurl",
+			"--workingPath", workingPath,
+			"--numPartitions", "5"
+		};
+		new SparkCreateSimRels(args("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", simRelArgs), spark)
+			.run(isLookUpService);
+
+		// step 2: merge relations
+		final String[] mergeRelArgs = {
+			"--graphBasePath", graphInputPath,
+			"--actionSetId", testActionSetId,
+			"--isLookUpUrl", "lookupurl",
+			"--workingPath", workingPath
+		};
+		new SparkCreateMergeRels(args("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", mergeRelArgs), spark)
+			.run(isLookUpService);
+
+		final Dataset<Relation> merges = spark
+			.read()
+			.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
+			.as(Encoders.bean(Relation.class));
+
+		final MapFunction<Relation, String> toTarget = rel -> rel.getTarget();
+		final long distinctRoots = merges
+			.filter("relclass == 'isMergedIn'")
+			.map(toTarget, Encoders.STRING())
+			.distinct()
+			.count();
+		assertEquals(3, distinctRoots);
+
+		final long mergesFromRoot = merges
+			.filter("source == '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
+			.count();
+		assertEquals(4, mergesFromRoot);
+
+		// step 3: dedup records
+		final String[] dedupRecordArgs = {
+			"--graphBasePath", graphInputPath,
+			"--actionSetId", testActionSetId,
+			"--isLookUpUrl", "lookupurl",
+			"--workingPath", workingPath
+		};
+		new SparkCreateDedupRecord(
+			args("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", dedupRecordArgs), spark)
+				.run(isLookUpService);
+
+		final Dataset<Publication> roots = spark
+			.read()
+			.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
+			.map(asEntity(Publication.class), Encoders.bean(Publication.class));
+
+		assertEquals(3, roots.count());
+
+		final Dataset<Publication> pubs = spark
+			.read()
+			.textFile(DedupUtility.createEntityPath(graphInputPath, "publication"))
+			.map(asEntity(Publication.class), Encoders.bean(Publication.class));
+
+		final Publication root = roots
+			.filter("id = '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
+			.first();
+		assertNotNull(root);
+
+		final List<Publication> matchingDuplicates = pubs
+			.filter("id = '50|doi_________::b3aec7985136e36827176aaa1dd5082d'")
+			.collectAsList();
+		final Publication crossrefDuplicate = matchingDuplicates.get(0);
+
+		assertEquals(crossrefDuplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
+		assertEquals(crossrefDuplicate.getJournal().getName(), root.getJournal().getName());
+		assertEquals(crossrefDuplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
+		assertEquals(crossrefDuplicate.getPublisher().getValue(), root.getPublisher().getValue());
+
+		final Set<String> rootPids = new HashSet<>();
+		for (StructuredProperty pid : root.getPid()) {
+			rootPids.add(pid.getValue());
+		}
+		final Set<String> dupPids = new HashSet<>();
+		for (StructuredProperty pid : crossrefDuplicate.getPid()) {
+			dupPids.add(pid.getValue());
+		}
+
+		// root and duplicate must share at least one pid value
+		assertFalse(Sets.intersection(rootPids, dupPids).isEmpty());
+		assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));
+		assertTrue(rootPids.contains("10.1109/jstqe.2023.9999999"));
+
+		final Optional<Instance> crossrefInstance = root
+			.getInstance()
+			.stream()
+			.filter(inst -> inst.getCollectedfrom().getValue().equals("Crossref"))
+			.findFirst();
+		assertTrue(crossrefInstance.isPresent());
+
+		final Instance cr = crossrefInstance.get();
+		assertEquals("OPEN", cr.getAccessright().getClassid());
+		assertEquals("Open Access", cr.getAccessright().getClassname());
+		assertEquals(OpenAccessRoute.hybrid, cr.getAccessright().getOpenAccessRoute());
+		assertEquals("IEEE Journal of Selected Topics in Quantum Electronics", cr.getHostedby().getValue());
+		assertEquals("0001", cr.getInstancetype().getClassid());
+		assertEquals("Article", cr.getInstancetype().getClassname());
+
+	}
+
+	private static String classPathResourceAsString(String path) throws IOException {
+		return IOUtils
+			.toString(
+				SparkPublicationRootsTest2.class
+					.getResourceAsStream(path));
+	}
+
+	private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
+		return value -> MAPPER.readValue(value, clazz);
+	}
+
+	private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
+		ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
+		parser.parseArgument(args);
+		return parser;
+	}
+
+}
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java
@ -168,11 +168,11 @@ public class SparkStatsTest implements Serializable {
 			.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
 			.count();

-		assertEquals(477, orgs_blocks);
+		assertEquals(480, orgs_blocks);
 		assertEquals(295, pubs_blocks);
 		assertEquals(122, sw_blocks);
 		assertEquals(191, ds_blocks);
-		assertEquals(171, orp_blocks);
+		assertEquals(178, orp_blocks);
 	}

 	@AfterAll
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/entities/publication/publication.gz
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/entities/publication/publication.gz
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml
@ -0,0 +1,24 @@
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value=""/>
+        <RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
+        <RESOURCE_KIND value="DedupOrchestrationDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
+    </HEADER>
+    <BODY>
+        <CONFIGURATION enabled="true">
+            <DEDUPLICATION>
+                <ENTITY code="20" label="Organization" name="organization"/>
+                <ACTION_SET id="test-orchestrator"/>
+                <SCAN_SEQUENCE>
+                    <SCAN id="publication"/>
+                </SCAN_SEQUENCE>
+            </DEDUPLICATION>
+        </CONFIGURATION>
+        <STATUS>
+            <LAST_UPDATE value="2001-12-31T12:00:00"/>
+        </STATUS>
+        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
+    </BODY>
+</RESOURCE_PROFILE>
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/entities/publication/publication_0.gz
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/entities/publication/publication_0.gz
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/log4j.properties
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/log4j.properties
@ -0,0 +1,47 @@
+# Root logger option
+log4j.rootLogger=DEBUG, stdout
+
+# Direct log messages to stdout
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
+
+# Change this to set Spark log level
+log4j.logger.org.apache.spark=ERROR
+log4j.rootCategory=WARN
+
+# Silence akka remoting
+log4j.logger.Remoting=WARN
+
+# Ignore messages below warning level from Jetty, because it's a bit verbose
+log4j.logger.org.eclipse.jetty=WARN
+
+log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN
+log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
+#log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN
+#log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN
+log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN
+#log4j.logger.org.apache.hadoop.io.compress=WARN
+#log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN
+log4j.logger.parquet.hadoop.ColumnChunkPageWriteStore=ERROR
+log4j.logger.com.jayway.jsonpath.internal.path.CompiledPath=WARN
+log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=ERROR
+log4j.logger.parquet.hadoop=WARN
+log4j.logger.org.eclipse.jetty.server.handler.ContextHandlerCollection=WARN
+log4j.logger.org.spark_project.jetty.util.component.ContainerLifeCycle=WARN
+log4j.logger.org.apache.hadoop.mapred.FileInputFormat=WARN
+log4j.logger.org.spark_project.jetty.servlet.ServletHandler=WARN
+log4j.logger.org.apache.commons.beanutils.converters.BooleanConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.StringConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.LongConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.ArrayConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.FloatConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.IntegerConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.DoubleConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.CharacterConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.ByteConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.BigIntegerConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.BigDecimalConverter=WARN
+log4j.logger.org.apache.commons.beanutils.converters.ShortConverter=WARN
+log4j.logger.org.apache.commons.beanutils.BeanUtils=WARN
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java
@ -26,6 +26,9 @@ public class QueryInformationSystem {
 		+ "  return  "
 		+ "  <community>  "
 		+ "  { $x//CONFIGURATION/context/@id}  "
+			+ " <advancedConstraints>" +
+			"{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }" +
+			"</advancedConstraints>"
 		+ "  <subjects>  "
 		+ "  {for $y in tokenize($subj,',')  "
 		+ "  return  "
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
@ -9,16 +9,16 @@ import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import org.apache.commons.lang3.StringUtils;

 import com.google.gson.Gson;
 import com.jayway.jsonpath.DocumentContext;
 import com.jayway.jsonpath.JsonPath;

-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

 /** Created by miriam on 02/08/2018. */
 public class ResultTagger implements Serializable {
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/StartsWithVerb.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/StartsWithVerb.java
@ -0,0 +1,30 @@
+
+package eu.dnetlib.dhp.bulktag.criteria;
+
+import java.io.Serializable;
+
+/**
+ * Selection criterion registered under the verb {@code starts_with}: it accepts the values that begin with the
+ * configured prefix.
+ */
+@VerbClass("starts_with")
+public class StartsWithVerb implements Selection, Serializable {
+
+	/** Prefix to look for; stays {@code null} until set via the constructor or {@link #setParam(String)}. */
+	private String param;
+
+	/** Creates a verb without a prefix; {@link #setParam(String)} is expected to provide it later. */
+	public StartsWithVerb() {
+	}
+
+	/**
+	 * @param param the prefix the tested values must start with
+	 */
+	public StartsWithVerb(final String param) {
+		this.param = param;
+	}
+
+	/**
+	 * Tests a value against the configured prefix.
+	 *
+	 * @param value the value to test
+	 * @return {@code true} when {@code value} starts with the prefix; {@code false} otherwise, including when the
+	 *         value or the prefix is {@code null}
+	 */
+	@Override
+	public boolean apply(String value) {
+		// null-safe: a missing value or an unset prefix is a non-match rather than a NullPointerException
+		return value != null && param != null && value.startsWith(param);
+	}
+
+	public String getParam() {
+		return param;
+	}
+
+	public void setParam(String param) {
+		this.param = param;
+	}
+}
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java
@ -16,6 +16,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
@ -45,7 +46,9 @@ public class BulkTagJobTest {
 		+ "  \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
 		+ "  \"contributor\" : \"$['contributor'][*]['value']\","
 		+ "  \"description\" : \"$['description'][*]['value']\", "
-		+ " \"subject\" :\"$['subject'][*]['value']\" }";
+		+ " \"subject\" :\"$['subject'][*]['value']\" , " +
+
+			"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='subject:fos')].value\"} ";

 	private static SparkSession spark;

@ -769,28 +772,14 @@ public class BulkTagJobTest {
 		org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);

 		idExplodeCommunity.show(false);
-		Assertions.assertEquals(4, idExplodeCommunity.count());
+		Assertions.assertEquals(5, idExplodeCommunity.count());

 		Assertions
 			.assertEquals(
 				3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
 		Assertions
 			.assertEquals(
-				1, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
+				2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
 	}

-//	@Test
-//	void test1(){
-//		ProtoMap params = new Gson().fromJson(pathMap, ProtoMap.class);
-//		HashMap<String, String> param = new HashMap<>();
-//			for (String key : params.keySet()) {
-//				try {
-//					param.put(key, jsonContext.read(params.get(key)));
-//				} catch (com.jayway.jsonpath.PathNotFoundException e) {
-//					param.put(key, new ArrayList<>());
-//				}
-//			}
-//			return param;
-//		}
-//	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml
@ -844,6 +844,89 @@
    <organizations/>
 </community>
    <community id="dariah">
+        <!-- Advanced tagging constraints for the "dariah" community, expressed as JSON.
+             The "criteria" array holds four entries; each entry's "constraint" list pairs
+             case-insensitive equality checks on "subject" with a "contains" check on "fos":
+               1) subject "North America"                      + fos contains "05"
+               2) subject "North America"                      + fos contains "06"
+               3) subjects "Mexico", "United States", "Canada" + fos contains "05"
+               4) subjects "Mexico", "United States", "Canada" + fos contains "06"
+             The "fos" field name has to be resolvable by the path map handed to the tagging
+             job (BulkTagJobTest maps it to subjects whose qualifier classid is 'subject:fos').
+             NOTE(review): presumably the constraints inside one criterion must all hold and
+             the criteria are alternatives; confirm against the constraint evaluator. -->
+        <advancedConstraints>
+            {
+            "criteria": [
+            {
+            "constraint": [
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "North America"
+            },
+            {
+            "verb": "contains",
+            "field": "fos",
+            "value": "05"
+            }
+            ]
+            },
+            {
+            "constraint": [
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "North America"
+            },
+            {
+            "verb": "contains",
+            "field": "fos",
+            "value": "06"
+            }
+            ]
+            },
+            {
+            "constraint": [
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "Mexico"
+            },
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "United States"
+            },
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "Canada"
+            },
+            {
+            "verb": "contains",
+            "field": "fos",
+            "value": "05"
+            }
+            ]
+            },
+            {
+            "constraint": [
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "Mexico"
+            },
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "United States"
+            },
+            {
+            "verb": "equals_caseinsensitive",
+            "field": "subject",
+            "value": "Canada"
+            },
+            {
+            "verb": "contains",
+            "field": "fos",
+            "value": "06"
+            }
+            ]
+            }
+            ]
+            }
+
+        </advancedConstraints>
    <subjects/>
    <datasources>
        <datasource>
@ -1174,7 +1257,9 @@
    </zenodocommunities>
    <organizations/>
 </community>
+
    <community id="euromarine">
+
    <subjects/>
    <datasources/>
    <zenodocommunities/>
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java
@ -142,6 +142,26 @@ public class IndexRecordTransformerTest {
 		testRecordTransformation(record);
 	}

+	/**
+	 * Feeds the {@code eosc-future/zenodo7353841.xml} fixture (an "other" research product
+	 * deposited in Zenodo and linked to the EOSC Future project) to
+	 * {@code testRecordTransformation}.
+	 *
+	 * @throws IOException if the fixture is missing or cannot be read
+	 * @throws TransformerException propagated from {@code testRecordTransformation}
+	 */
+	@Test
+	public void testForEOSCFutureZenodo7353841() throws IOException, TransformerException {
+		testRecordTransformation(readEoscFutureRecord("zenodo7353841.xml"));
+	}
+
+	/**
+	 * Feeds the {@code eosc-future/zenodo7351393.xml} fixture (a software record with four
+	 * creators, linked to the EOSC Future and BY-COVID projects) to
+	 * {@code testRecordTransformation}.
+	 *
+	 * @throws IOException if the fixture is missing or cannot be read
+	 * @throws TransformerException propagated from {@code testRecordTransformation}
+	 */
+	@Test
+	public void testForEOSCFutureZenodo7351393() throws IOException, TransformerException {
+		testRecordTransformation(readEoscFutureRecord("zenodo7351393.xml"));
+	}
+
+	/**
+	 * Feeds the {@code eosc-future/zenodo7351221.xml} fixture (a software record carrying a
+	 * {@code covid-19} community context, linked to the EOSC Future and BY-COVID projects) to
+	 * {@code testRecordTransformation}.
+	 *
+	 * @throws IOException if the fixture is missing or cannot be read
+	 * @throws TransformerException propagated from {@code testRecordTransformation}
+	 */
+	@Test
+	public void testForEOSCFutureZenodo7351221() throws IOException, TransformerException {
+		testRecordTransformation(readEoscFutureRecord("zenodo7351221.xml"));
+	}
+
+	/**
+	 * Reads one of the {@code eosc-future} fixtures, resolved relative to this test class,
+	 * as UTF-8 text.
+	 *
+	 * @param fileName file name inside the {@code eosc-future} resource folder
+	 * @return the full content of the fixture
+	 * @throws IOException if the resource does not exist or cannot be read
+	 */
+	private String readEoscFutureRecord(final String fileName) throws IOException {
+		final String resource = "eosc-future/" + fileName;
+		// try-with-resources closes the stream; a null resource is skipped by the language,
+		// so the explicit check below can report the missing fixture by name.
+		try (java.io.InputStream in = getClass().getResourceAsStream(resource)) {
+			if (in == null) {
+				throw new IOException("test resource not found: " + resource);
+			}
+			// Explicit charset: the charset-less overload decodes with the platform default.
+			return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
+		}
+	}
+
 	@Test
 	void testDoiUrlNormalization() throws MalformedURLException {

--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/zenodo7351221.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/zenodo7351221.xml
@ -0,0 +1,99 @@
+<record>
+    <result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        <header>
+            <dri:objIdentifier>doi_________::9cb0664d4c891c4baaf73f007c0c9de0</dri:objIdentifier>
+            <dri:dateOfCollection>2022-11-25T12:55:13Z</dri:dateOfCollection>
+            <dri:status>under curation</dri:status>
+            <counters />
+        </header>
+        <metadata>
+            <oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
+                        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                        xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
+                <oaf:result>
+
+                    <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">COVID-19 Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
+                    <creator rank="1" name="" surname="">Karki, Reagon</creator>
+                    <dateofacceptance />
+                    <resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
+                    <language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
+                    <description><p>A Knowledge graph representation of compounds and associated biological entities in the BY-COVID and EOSC Future project.</p> <p><strong>Current status</strong></p> <ul> <li>Number of Nodes: 35952</li> <li>Number of Edges: 279462</li> <li>Human Proteins: 1347</li> <li>Assay: 15835</li> <li>Chemical/Compound: 4096</li> <li>Mechanism of Action: 739</li> <li>Pathway: 1513</li> <li>Disease: 1585</li> <li>SideEffect: 7420</li> <li>Biological Process: 2085</li> <li>Molecular Function: 1332</li> </ul> <p>Please check the BY_COVID_update_August.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in &#39;data\export&#39; folder.</p> <p></p></description>
+                    <country classid="" classname="" schemeid="" schemename="" />
+                    <subject classid="" classname="" schemeid="" schemename="" />
+                    <relevantdate classid="" classname="" schemeid="" schemename="" />
+                    <publisher>Zenodo</publisher>
+                    <embargoenddate />
+                    <journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
+                    <source />
+                    <fulltext />
+                    <format />
+                    <storagedate />
+                    <resourcetype classid="" classname="" schemeid="" schemename="" />
+                    <device />
+                    <size />
+                    <version />
+                    <lastmetadataupdate />
+                    <metadataversionnumber />
+                    <documentationUrl />
+                    <codeRepositoryUrl />
+                    <programmingLanguage classid="" classname="" schemeid="" schemename="" />
+                    <contactperson />
+                    <contactgroup />
+                    <tool />
+                    <originalId>oai:zenodo.org:7351221</originalId>
+                    <collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                    <pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351221</pid>
+                    <pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351221</pid>
+                    <bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
+                    <context id="covid-19" label="COVID-19" type="community"></context>
+                    <datainfo>
+                        <inferred>false</inferred>
+                        <deletedbyinference>false</deletedbyinference>
+                        <trust>0.9</trust>
+                        <inferenceprovenance />
+                        <provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
+                    </datainfo>
+                    <rels>
+                        <rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
+                            <to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
+                            <code>101017536</code>
+                            <acronym>EOSC Future</acronym>
+                            <title>EOSC Future</title>
+                            <contracttype classid="" classname="" schemeid="" schemename="" />
+                            <funding>
+                                <funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
+                                <funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
+                            </funding>
+                            <websiteurl />
+                        </rel>
+                        <rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
+                            <to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
+                            <code>101046203</code>
+                            <acronym>BY-COVID</acronym>
+                            <title>Beyond COVID</title>
+                            <contracttype classid="" classname="" schemeid="" schemename="" />
+                            <funding>
+                                <funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
+                                <funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
+                            </funding>
+                            <websiteurl />
+                        </rel>
+                    </rels>
+                    <children>
+                        <instance id="od______2659::040cee965a4544e343a2ba149783c3fc">
+                            <instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
+                            <collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                            <hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                            <accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
+                            <dateofacceptance />
+                            <webresource>
+                                <url>https://zenodo.org/record/7351221</url>
+                            </webresource>
+                        </instance>
+                    </children>
+                </oaf:result>
+            </oaf:entity>
+        </metadata>
+    </result>
+</record>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/zenodo7351393.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/zenodo7351393.xml
@ -0,0 +1,100 @@
+<record>
+    <result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        <header>
+            <dri:objIdentifier>doi_________::07fdccabd77830e3caccf0b33c083f1b</dri:objIdentifier>
+            <dri:dateOfCollection>2022-11-25T01:08:31Z</dri:dateOfCollection>
+            <dri:status>under curation</dri:status>
+            <counters />
+        </header>
+        <metadata>
+            <oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
+                        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                        xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
+                <oaf:result>
+                    <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Monkeypox Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
+                    <creator rank="1" name="" surname="">Karki, Reagon</creator>
+                    <creator rank="2" name="" surname="">Andrea, Zaliani</creator>
+                    <creator rank="3" name="" surname="">Gadiya, Yojana</creator>
+                    <creator rank="4" name="" surname="">Gribbon, Philip</creator>
+                    <dateofacceptance />
+                    <resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
+                    <language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
+                    <description><p>The Monkeypox KG is built using viral and human proteins reported in different resources. Additionally, the KG represents chemicals tested against Monkeypox and their targets, associated biological processes, molecular functions, diseases and side effects.</p> <p><strong>KG status</strong></p> <p>Version 1 stats:</p> <ul> <li>Number of Nodes: 8235</li> <li>Number of Edges: 40422</li> </ul> <p>Version 2 stats (2nd September) :</p> <ul> <li>Number of Nodes: 9129</li> <li>Number of Edges: 44568</li> </ul> <p>Please check the graph.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in &#39;data\export&#39; folder.</p> <p></p></description>
+                    <country classid="" classname="" schemeid="" schemename="" />
+                    <subject classid="" classname="" schemeid="" schemename="" />
+                    <relevantdate classid="" classname="" schemeid="" schemename="" />
+                    <publisher>Zenodo</publisher>
+                    <embargoenddate />
+                    <journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
+                    <source />
+                    <fulltext />
+                    <format />
+                    <storagedate />
+                    <resourcetype classid="" classname="" schemeid="" schemename="" />
+                    <device />
+                    <size />
+                    <version />
+                    <lastmetadataupdate />
+                    <metadataversionnumber />
+                    <documentationUrl />
+                    <codeRepositoryUrl />
+                    <programmingLanguage classid="" classname="" schemeid="" schemename="" />
+                    <contactperson />
+                    <contactgroup />
+                    <tool />
+                    <originalId>oai:zenodo.org:7351393</originalId>
+                    <collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                    <pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351393</pid>
+                    <pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351393</pid>
+                    <bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
+                    <datainfo>
+                        <inferred>false</inferred>
+                        <deletedbyinference>false</deletedbyinference>
+                        <trust>0.9</trust>
+                        <inferenceprovenance />
+                        <provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
+                    </datainfo>
+                    <rels>
+                        <rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
+                            <to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
+                            <code>101017536</code>
+                            <acronym>EOSC Future</acronym>
+                            <title>EOSC Future</title>
+                            <contracttype classid="" classname="" schemeid="" schemename="" />
+                            <funding>
+                                <funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
+                                <funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
+                            </funding>
+                            <websiteurl />
+                        </rel>
+                        <rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
+                            <to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
+                            <code>101046203</code>
+                            <acronym>BY-COVID</acronym>
+                            <title>Beyond COVID</title>
+                            <contracttype classid="" classname="" schemeid="" schemename="" />
+                            <funding>
+                                <funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
+                                <funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
+                            </funding>
+                            <websiteurl />
+                        </rel>
+                    </rels>
+                    <children>
+                        <instance id="od______2659::db2bc6381545f80dc9feec808a173ec0">
+                            <instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
+                            <collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                            <hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                            <accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
+                            <dateofacceptance />
+                            <webresource>
+                                <url>https://zenodo.org/record/7351393</url>
+                            </webresource>
+                        </instance>
+                    </children>
+                </oaf:result>
+            </oaf:entity>
+        </metadata>
+    </result>
+</record>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/zenodo7353841.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/zenodo7353841.xml
@ -0,0 +1,85 @@
+<record>
+    <result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        <header>
+            <dri:objIdentifier>doi_________::93d39dd7edef016928788c3500e149f1</dri:objIdentifier>
+            <dri:dateOfCollection>2022-11-24T08:41:37Z</dri:dateOfCollection>
+            <dri:status>under curation</dri:status>
+            <counters/>
+        </header>
+        <metadata>
+            <oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
+                        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                        xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
+                <oaf:result>
+                    <title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">ENVRI SP - Dashboard State of the Environment - Ocean Indicators</title>
+                    <creator rank="1" name="" surname="">Tjerk Krijger</creator>
+                    <dateofacceptance />
+                    <resulttype classid="other" classname="other" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
+                    <language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
+                    <description><p>The attached .yaml file is used as input to the Dashboard State of the Environment, which is a science project of the ENVRI-FAIR science cluster within EOSC-FUTURE. The contents of the file enable the visualization of  Ocean indicators on the dashboard. It is possible to download the attached file and change the contents to include indicators from different domains such as atmosphere or biodiversity.</p></description>
+                    <country classid="" classname="" schemeid="" schemename="" />
+                    <subject classid="" classname="" schemeid="" schemename="" />
+                    <relevantdate classid="" classname="" schemeid="" schemename="" />
+                    <publisher>Zenodo</publisher>
+                    <embargoenddate />
+                    <journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
+                    <source />
+                    <fulltext />
+                    <format />
+                    <storagedate />
+                    <resourcetype classid="" classname="" schemeid="" schemename="" />
+                    <device />
+                    <size />
+                    <version />
+                    <lastmetadataupdate />
+                    <metadataversionnumber />
+                    <documentationUrl />
+                    <codeRepositoryUrl />
+                    <programmingLanguage classid="" classname="" schemeid="" schemename="" />
+                    <contactperson />
+                    <contactgroup />
+                    <tool />
+                    <originalId>oai:zenodo.org:7353841</originalId>
+                    <collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                    <pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7353841</pid>
+                    <pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7353841</pid>
+                    <bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
+                    <datainfo>
+                        <inferred>false</inferred>
+                        <deletedbyinference>false</deletedbyinference>
+                        <trust>0.9</trust>
+                        <inferenceprovenance />
+                        <provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
+                    </datainfo>
+                    <rels>
+                        <rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
+                            <to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
+                            <code>101017536</code>
+                            <acronym>EOSC Future</acronym>
+                            <title>EOSC Future</title>
+                            <contracttype classid="" classname="" schemeid="" schemename="" />
+                            <funding>
+                                <funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
+                                <funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
+                            </funding>
+                            <websiteurl />
+                        </rel>
+                    </rels>
+                    <children>
+                        <instance id="od______2659::3e4323c221f269e5f3d6db4c61dd2ec8">
+                            <instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
+                            <collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                            <hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
+                            <accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
+                            <dateofacceptance />
+                            <webresource>
+                                <url>https://zenodo.org/record/7353841</url>
+                            </webresource>
+                        </instance>
+                    </children>
+                </oaf:result>
+            </oaf:entity>
+        </metadata>
+    </result>
+</record>
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@ -48,7 +48,9 @@ create table TARGET.result stored as parquet as
             'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
             'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
             'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
-             'openorgs____::e15adb13c4dadd49de4d35c39b5da93a'  -- Nanyang Technological University
+             'openorgs____::e15adb13c4dadd49de4d35c39b5da93a',  -- Nanyang Technological University
+             'openorgs____::4b34103bde246228fcd837f5f1bf4212',  -- Autonomous University of Barcelona
+             'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb'	-- McMaster University
        ) )) foo;
 compute stats TARGET.result;