From 0aa725083f66875b4eef7ad6513b24bf557eb5ec Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 17 Nov 2022 16:13:43 +0100 Subject: [PATCH] extended dedup testing --- .../dhp/oa/dedup/DedupRecordFactory.java | 2 +- ...st.java => SparkPublicationRootsTest.java} | 283 +++++++++--------- .../oa/dedup/SparkPublicationRootsTest2.java | 251 ++++++++++++++++ .../entities2/publication/publication.gz | Bin 9056 -> 0 bytes .../alterations/publication/publication_1.gz | Bin 0 -> 1488 bytes .../entities/publication/publication_0.gz | Bin 0 -> 10874 bytes 6 files changed, 400 insertions(+), 136 deletions(-) rename dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/{SparkDedupPublicationTest.java => SparkPublicationRootsTest.java} (57%) create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java delete mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/entities2/publication/publication.gz create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz create mode 100644 dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/entities/publication/publication_0.gz diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index f9fc8a21a..82bf87cca 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -112,7 +112,7 @@ public class DedupRecordFactory { // set authors and date if (ModelSupport.isSubClass(entity, Result.class)) { - ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); + // ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); ((Result) entity).setAuthor(AuthorMerger.merge(authors)); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupPublicationTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java similarity index 57% rename from dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupPublicationTest.java rename to dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java index 773de65fa..3cff836eb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupPublicationTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java @@ -11,16 +11,17 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.URISyntaxException; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -31,7 +32,6 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Sets; @@ -44,48 +44,52 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) -public class SparkDedupPublicationTest implements Serializable { +public class SparkPublicationRootsTest implements Serializable { @Mock(serializable = true) ISLookUpService isLookUpService; private static SparkSession spark; - private static JavaSparkContext jsc; + private static String workingPath; - private static String testGraphBasePath; - private static String testOutputBasePath; - private static String testDedupGraphBasePath; + private static String graphInputPath; + private static String graphOutputPath; private static final String testActionSetId = "test-orchestrator"; + private static Path testBaseTmpPath; + + private static final ObjectMapper MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @BeforeAll - public static void cleanUp() throws IOException, URISyntaxException { + public static void init() throws IOException, URISyntaxException { - testGraphBasePath = Paths - .get(SparkDedupPublicationTest.class.getResource("/eu/dnetlib/dhp/dedup/entities2").toURI()) - .toFile() - .getAbsolutePath(); - testOutputBasePath = createTempDirectory(SparkDedupPublicationTest.class.getSimpleName() + "-") - .toAbsolutePath() - .toString(); + testBaseTmpPath = createTempDirectory(SparkPublicationRootsTest.class.getSimpleName() + "-"); - testDedupGraphBasePath = createTempDirectory(SparkDedupPublicationTest.class.getSimpleName() + "-") - .toAbsolutePath() - .toString(); + final File entitiesSources = Paths + .get(SparkPublicationRootsTest.class.getResource("/eu/dnetlib/dhp/dedup/root").toURI()) + .toFile(); - FileUtils.deleteDirectory(new File(testOutputBasePath)); - FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); + FileUtils + .copyDirectory( + entitiesSources, + testBaseTmpPath.resolve("input").toFile()); + + workingPath = testBaseTmpPath.resolve("workingPath").toString(); + graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString(); + graphOutputPath = testBaseTmpPath.resolve("output").toString(); + + FileUtils.deleteDirectory(new File(workingPath)); + FileUtils.deleteDirectory(new File(graphOutputPath)); final SparkConf conf = new SparkConf(); conf.set("spark.sql.shuffle.partitions", "10"); spark = SparkSession .builder() - .appName(SparkDedupPublicationTest.class.getSimpleName()) + .appName(SparkPublicationRootsTest.class.getSimpleName()) .master("local[*]") .config(conf) .getOrCreate(); - - jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } @BeforeEach @@ -100,55 +104,51 @@ public class SparkDedupPublicationTest implements Serializable { .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")); } + @AfterAll + public static void tearDown() throws IOException { + FileUtils.deleteDirectory(testBaseTmpPath.toFile()); + spark.close(); + } + @Test @Order(1) void createSimRelsTest() throws Exception { - - ArgumentApplicationParser parser = new ArgumentApplicationParser( - classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")); - - parser - .parseArgument( - new String[] { - "--graphBasePath", testGraphBasePath, - "--actionSetId", testActionSetId, - "--isLookUpUrl", "lookupurl", - "--workingPath", testOutputBasePath, - "--numPartitions", "5" - }); - - new SparkCreateSimRels(parser, spark).run(isLookUpService); + new SparkCreateSimRels(args( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath, + "--numPartitions", "5" + }), spark) + .run(isLookUpService); long pubs_simrel = spark .read() - .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "publication")) + .load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication")) .count(); - assertEquals(62, pubs_simrel); + assertEquals(74, pubs_simrel); } @Test @Order(2) void cutMergeRelsTest() throws Exception { - - ArgumentApplicationParser parser = new ArgumentApplicationParser( - classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")); - - parser - .parseArgument( - new String[] { - "--graphBasePath", testGraphBasePath, - "--actionSetId", testActionSetId, - "--isLookUpUrl", "lookupurl", - "--workingPath", testOutputBasePath, - "--cutConnectedComponent", "3" - }); - - new SparkCreateMergeRels(parser, spark).run(isLookUpService); + new SparkCreateMergeRels(args( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath, + "--cutConnectedComponent", "3" + }), spark) + .run(isLookUpService); long pubs_mergerel = spark .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .load(workingPath + "/" + testActionSetId + "/publication_mergerel") .as(Encoders.bean(Relation.class)) .filter((FilterFunction) r -> r.getRelClass().equalsIgnoreCase("merges")) .groupBy("source") @@ -159,49 +159,44 @@ public class SparkDedupPublicationTest implements Serializable { assertEquals(0, pubs_mergerel); - FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")); + FileUtils.deleteDirectory(new File(workingPath + "/" + testActionSetId + "/publication_mergerel")); } @Test @Order(3) void createMergeRelsTest() throws Exception { + new SparkCreateMergeRels(args( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath + }), spark) + .run(isLookUpService); - ArgumentApplicationParser parser = new ArgumentApplicationParser( - classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")); - - parser - .parseArgument( - new String[] { - "--graphBasePath", testGraphBasePath, - "--actionSetId", testActionSetId, - "--isLookUpUrl", "lookupurl", - "--workingPath", testOutputBasePath - }); - - new SparkCreateMergeRels(parser, spark).run(isLookUpService); - - final Dataset pubs = spark + final Dataset merges = spark .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .load(workingPath + "/" + testActionSetId + "/publication_mergerel") .as(Encoders.bean(Relation.class)); - final List merges = pubs + final List mergeList = merges .filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'") .collectAsList(); - assertEquals(3, merges.size()); + assertEquals(3, mergeList.size()); Set dups = Sets .newHashSet( "50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73", "50|doi_________::d5021b53204e4fdeab6ff5d5bc468032", "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c"); - merges.forEach(r -> { + mergeList.forEach(r -> { assertEquals(ModelConstants.RESULT_RESULT, r.getRelType()); assertEquals(ModelConstants.DEDUP, r.getSubRelType()); assertEquals(ModelConstants.MERGES, r.getRelClass()); assertTrue(dups.contains(r.getTarget())); }); - final List mergedIn = pubs + final List mergedIn = merges .filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'") .collectAsList(); assertEquals(3, mergedIn.size()); @@ -212,47 +207,37 @@ public class SparkDedupPublicationTest implements Serializable { assertTrue(dups.contains(r.getSource())); }); - assertEquals(24, pubs.count()); + assertEquals(32, merges.count()); } @Test @Order(4) void createDedupRecordTest() throws Exception { - - ArgumentApplicationParser parser = new ArgumentApplicationParser( - classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")); - parser - .parseArgument( - new String[] { - "--graphBasePath", testGraphBasePath, - "--actionSetId", testActionSetId, - "--isLookUpUrl", "lookupurl", - "--workingPath", testOutputBasePath - }); - - new SparkCreateDedupRecord(parser, spark).run(isLookUpService); - - final ObjectMapper mapper = new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + new SparkCreateDedupRecord(args( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath + }), spark) + .run(isLookUpService); final Dataset roots = spark .read() - .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord") - .map( - (MapFunction) value -> mapper.readValue(value, Publication.class), - Encoders.bean(Publication.class)); + .textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord") + .map(asEntity(Publication.class), Encoders.bean(Publication.class)); - assertEquals(2, roots.count()); + assertEquals(3, roots.count()); final Dataset pubs = spark .read() - .textFile(DedupUtility.createEntityPath(testGraphBasePath, "publication")) - .map( - (MapFunction) value -> mapper.readValue(value, Publication.class), - Encoders.bean(Publication.class)); + .textFile(DedupUtility.createEntityPath(graphInputPath, "publication")) + .map(asEntity(Publication.class), Encoders.bean(Publication.class)); verifyRoot_case_1(roots, pubs); verifyRoot_case_2(roots, pubs); + verifyRoot_case_3(roots, pubs); } private static void verifyRoot_case_1(Dataset roots, Dataset pubs) { @@ -299,8 +284,7 @@ public class SparkDedupPublicationTest implements Serializable { assertEquals("Article", instance_cr.get().getInstancetype().getClassname()); } - private void verifyRoot_case_2(Dataset roots, Dataset pubs) - throws JsonProcessingException { + private void verifyRoot_case_2(Dataset roots, Dataset pubs) { Publication root = roots .filter("id = '50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885'") .first(); @@ -334,57 +318,86 @@ public class SparkDedupPublicationTest implements Serializable { assertTrue(Sets.difference(root_cf, dups_cf).isEmpty()); } + private void verifyRoot_case_3(Dataset roots, Dataset pubs) { + Publication root = roots + .filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'") + .first(); + assertNotNull(root); + + Publication pivot_duplicate = pubs + .filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'") + .first(); + + assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue()); + + Set dups_cf = pubs + .collectAsList() + .stream() + .flatMap(p -> p.getCollectedfrom().stream()) + .map(KeyValue::getValue) + .collect(Collectors.toCollection(HashSet::new)); + + Set root_cf = root + .getCollectedfrom() + .stream() + .map(KeyValue::getValue) + .collect(Collectors.toCollection(HashSet::new)); + + assertTrue(Sets.difference(root_cf, dups_cf).isEmpty()); + } + @Test @Order(6) void updateEntityTest() throws Exception { + new SparkUpdateEntity(args( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--workingPath", workingPath, + "--dedupGraphPath", graphOutputPath + }), spark) + .run(isLookUpService); - ArgumentApplicationParser parser = new ArgumentApplicationParser( - classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")); - parser - .parseArgument( - new String[] { - "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath - }); - - new SparkUpdateEntity(parser, spark).run(isLookUpService); - - long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); + long publications = spark.read().textFile(graphOutputPath + "/publication").count(); long mergedPubs = spark .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .load(workingPath + "/" + testActionSetId + "/publication_mergerel") .as(Encoders.bean(Relation.class)) .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) + .map((MapFunction) Relation::getTarget, Encoders.STRING()) .distinct() .count(); - assertEquals(14, publications); + assertEquals(19, publications); // 16 originals + 3 roots - long deletedPubs = jsc - .textFile(testDedupGraphBasePath + "/publication") - .filter(this::isDeletedByInference) + long deletedPubs = spark + .read() + .textFile(graphOutputPath + "/publication") + .map(asEntity(Publication.class), Encoders.bean(Publication.class)) + .filter("datainfo.deletedbyinference == true") + .map((MapFunction) OafEntity::getId, Encoders.STRING()) + .distinct() .count(); assertEquals(mergedPubs, deletedPubs); } - @AfterAll - public static void finalCleanUp() throws IOException { - FileUtils.deleteDirectory(new File(testOutputBasePath)); - FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); - } - - public boolean isDeletedByInference(String s) { - return s.contains("\"deletedbyinference\":true"); - } - private static String classPathResourceAsString(String path) throws IOException { return IOUtils .toString( - SparkDedupPublicationTest.class + SparkPublicationRootsTest.class .getResourceAsStream(path)); } + private static MapFunction asEntity(Class clazz) { + return value -> MAPPER.readValue(value, clazz); + } + + private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException { + ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs)); + parser.parseArgument(args); + return parser; + } + } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java new file mode 100644 index 000000000..9afe1e34b --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java @@ -0,0 +1,251 @@ + +package eu.dnetlib.dhp.oa.dedup; + +import static java.nio.file.Files.createTempDirectory; + +import static org.apache.spark.sql.functions.count; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.lenient; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Sets; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +@ExtendWith(MockitoExtension.class) +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +public class SparkPublicationRootsTest2 implements Serializable { + + @Mock(serializable = true) + ISLookUpService isLookUpService; + private static SparkSession spark; + + private static String workingPath; + + private static String graphInputPath; + + private static String graphOutputPath; + + private static final String testActionSetId = "test-orchestrator"; + + private static Path testBaseTmpPath; + + private static final ObjectMapper MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + @BeforeAll + public static void init() throws IOException, URISyntaxException { + + testBaseTmpPath = createTempDirectory(SparkPublicationRootsTest2.class.getSimpleName() + "-"); + + final File entitiesSources = Paths + .get(SparkPublicationRootsTest2.class.getResource("/eu/dnetlib/dhp/dedup/root").toURI()) + .toFile(); + + FileUtils + .copyDirectory( + entitiesSources, + testBaseTmpPath.resolve("input").toFile()); + + FileUtils + .copyFileToDirectory( + Paths + .get( + SparkPublicationRootsTest2.class + .getResource( + "/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz") + .toURI()) + .toFile(), + testBaseTmpPath.resolve("input").resolve("entities").resolve("publication").toFile()); + + workingPath = testBaseTmpPath.resolve("workingPath").toString(); + graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString(); + graphOutputPath = testBaseTmpPath.resolve("output").toString(); + + final SparkConf conf = new SparkConf(); + conf.set("spark.sql.shuffle.partitions", "10"); + spark = SparkSession + .builder() + .appName(SparkPublicationRootsTest2.class.getSimpleName()) + .master("local[*]") + .config(conf) + .getOrCreate(); + } + + @BeforeEach + public void setUp() throws IOException, ISLookUpException { + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) + .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml")); + + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) + .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")); + } + + @AfterAll + public static void tearDown() throws IOException { + FileUtils.deleteDirectory(testBaseTmpPath.toFile()); + } + + @Test + @Order(7) + void dedupAlteredDatasetTest() throws Exception { + + new SparkCreateSimRels(args( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath, + "--numPartitions", "5" + }), spark) + .run(isLookUpService); + + new SparkCreateMergeRels(args( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath + }), spark) + .run(isLookUpService); + + final Dataset merges = spark + .read() + .load(workingPath + "/" + testActionSetId + "/publication_mergerel") + .as(Encoders.bean(Relation.class)); + + assertEquals( + 3, merges + .filter("relclass == 'isMergedIn'") + .map((MapFunction) Relation::getTarget, Encoders.STRING()) + .distinct() + .count()); + assertEquals( + 4, merges + .filter("source == '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'") + .count()); + + new SparkCreateDedupRecord(args( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", + new String[] { + "--graphBasePath", graphInputPath, + "--actionSetId", testActionSetId, + "--isLookUpUrl", "lookupurl", + "--workingPath", workingPath + }), spark) + .run(isLookUpService); + + final Dataset roots = spark + .read() + .textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord") + .map(asEntity(Publication.class), Encoders.bean(Publication.class)); + + assertEquals(3, roots.count()); + + final Dataset pubs = spark + .read() + .textFile(DedupUtility.createEntityPath(graphInputPath, "publication")) + .map(asEntity(Publication.class), Encoders.bean(Publication.class)); + + Publication root = roots + .filter("id = '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'") + .first(); + assertNotNull(root); + + Publication crossref_duplicate = pubs + .filter("id = '50|doi_________::b3aec7985136e36827176aaa1dd5082d'") + .collectAsList() + .get(0); + + assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue()); + assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName()); + assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted()); + assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue()); + + Set rootPids = root + .getPid() + .stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toCollection(HashSet::new)); + Set dupPids = crossref_duplicate + .getPid() + .stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toCollection(HashSet::new)); + + assertFalse(Sets.intersection(rootPids, dupPids).isEmpty()); + assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716")); + assertTrue(rootPids.contains("10.1109/jstqe.2023.9999999")); + + Optional instance_cr = root + .getInstance() + .stream() + .filter(i -> i.getCollectedfrom().getValue().equals("Crossref")) + .findFirst(); + assertTrue(instance_cr.isPresent()); + assertEquals("OPEN", instance_cr.get().getAccessright().getClassid()); + assertEquals("Open Access", instance_cr.get().getAccessright().getClassname()); + assertEquals(OpenAccessRoute.hybrid, instance_cr.get().getAccessright().getOpenAccessRoute()); + assertEquals( + "IEEE Journal of Selected Topics in Quantum Electronics", instance_cr.get().getHostedby().getValue()); + assertEquals("0001", instance_cr.get().getInstancetype().getClassid()); + assertEquals("Article", instance_cr.get().getInstancetype().getClassname()); + + } + + private static String classPathResourceAsString(String path) throws IOException { + return IOUtils + .toString( + SparkPublicationRootsTest2.class + .getResourceAsStream(path)); + } + + private static MapFunction asEntity(Class clazz) { + return value -> MAPPER.readValue(value, clazz); + } + + private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException { + ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs)); + parser.parseArgument(args); + return parser; + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/entities2/publication/publication.gz b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/entities2/publication/publication.gz deleted file mode 100644 index 96dd218174dbb1fa6ab55da289dc116bbceb9963..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9056 zcmV-mBcI$KiwFp`FK%N118{X>Y-wX*bZKvHE_8Tw0PTJ2a@$Cf?*H=?6^_^mIffV9 zZ@oPy)Uw*G9b3}MvU|_YnNSpff(RP~H~=A8?d^Bp%qoxsNRXl=Qlg~lu-g(*tV?EA zR#xVhi63u9rJuybfaPyI^5e}FO-cqoy`lMk#aD0s?2tDh9n+&U%J}m*y^4$2Pq0ly zlY&8mQJ!6~l%@fr!8p!R-Y`gLQN$s3RooQuaFpd^PdqQ!_zirGw@B%b5BD3*uUIi= zAwDUB0UI*$_$PxyF`#1a< zgAU*!e2?MxeV7;x0#+1xd^s4;FYoj5-;YmzKVDc;nvDs)qH#izTy4t|on8#HkS$wL zqX|wXq3LBwFWLMo$~0VHrYQ4C%ad!C zc8}S$;L7CFcQm>Q`KTj4;6MHeKjt+320yFVSro-dOe=ZePopwP7CY^bswqp;*Y*MR+Xv*`b%^L>VXQJA z(=e-#wx5o(G|t+`Gu9YSblD-3j{fA|XGK=#0UO_p76faVc-KB(kSkOf2l1F)Ogdg7 zUyJQf&NA3$aY~aTzP>lA(lZ>>RQfdk9$)n|O>ugPr5pB}fAXDM1bIA~thc7Yz5plV zfF|7odKHgv$fq)oXu!IM!+1Q7(@XM!7A!Bw-j{<8B#SMphmKM$hm-h;U31$KogGH z5JWmG85w8TVVsk+9I`ws3mB`2bJQ02oe@qS1}@5AKxjdd?3#?SYnJmVltaGc4xcaq z^-v~av|_|q*nu{~5op^K2JR4=hJ?XlalS*EmOzGanZty7M9eC0shB>u)=)c6}N5WYM@1rLJ`ao+r)A7@E+iL4OR z!aePmpd81fOvX8lD2!N?eit-%RxnszNOIT^#ZNE^AWIg3PR8CZa|Yi5+Q1@%vET`= zzsna5gMoJLf`kDD!zN?)aDuGs0};Zv6OLqg(IaOpVCk4#XZg2+4CocCWsqH`eEB7i z4F8?1F1!Khje`Q{G@%DhVMND3wiFg0WS42o*?=UEhoCy^?1vv=YQq^%MGM>o1kRi+ z#73N1;U()vu-Fnr5_LEiw;tIKL*6}2lAF$C3VhOVj%CDWEX7HOI2BkG2r>+kW@!Q^ z0I(MhIoN<>KL4qx0?Sx_mBl%inUe2}y|aV2Fu9Rv&jky3b`!INPmz5OSdt*w;9wkc zvEtH_j|qw`6_Y(5fD|EJQLJ*1P~1j*b>E>;%zK7)!1{)KPa(&r+boK65Q;Ih;=iWs z8pS?fqcM&~$@{?{UHryu9TO%Av4nrdeC)kX$H#9$+~Ne79FKwQ1bG2_jFYdDNSrML zQlOOrd z*NyU+r6GPBMghL01uPeqK$G|nG0@&}+Tp5Tb9Q0iE?AIB1Z?x6(+qS|z?YDvNU^t_ z`K`M5{AM;$W0R9YdYEU`$to*^n@iaJ;_uB7IKM76@H51OIQJDt|L&^VBm#(2%r}nSVbj2 zM~8=pV8Eayk>Gy$1Uo-K!9LGMmHztA5~QXal0$rwisdFE=N+(&WfAPYw3u#c{F< zMO{q(^E};GLnU;W6S=zLXqIJ}VdRFE?`VRt2OpOh449Wh$y0!TK!QP^oR2 zx@!hb@TN}e&*`@mEu=FzEsJ=Z`GV*Y5{txxvW-X96CAy)3`% zS6?Xl{!QQLDgA_9(gchkvEj=+;nx48hwyt3IKjWT`a&Q#>T}x>{j+i!fSLg#0-1J;y}+%LakWbt(`-ChZY}UJ zZZQ^kn})Orfw4fp3xjIu{K^r2d2x2N&$lx-=k~cZWT`Q`)l#;R`As(4iur-nS+C8>*5(N=+~1^9yzF- zEW+hhyR{cVj0RC5*SFK_JLS>{hUd#&&p-eCKR@P=gjahznM74r#hf1BqOD&Xck~Dw6-Pw}m22sW;#<^W zdZ;;?PeWY|e2Zxfqw*lnKxkMbD6*_e2d?dw64fBWNX2y4Nmeav8OJftj+!zu_ZEKnf`(C1j(uQAF8%4#@*PE^Y>EJe`_MYB1a zp@uoB0Uqj*1_2KtO&H1@@F^2e3a7jD9^URf%<=$^DbNNO3~oO}&RD@{9t36ntb!{Y#-m`>@qIK`OTBDQ{4hTcae|t zJ6%AsNCWt_J|Jv@0+ajv8SitwYZ$KHbq&p0ln;)gBU6OlTByAVR8LC9GnNVsq{48z zuA&=K`#)pt-{9l??yeB4q=t%J8!I~j$cih-Gl&Tb;m-p2%iI}_2>$bP#M~Muncm=AF7R8pzpn~luj2y3 zX^7v>vT`iM;l|J7b{+RZ0^3W2V=5>q^;|JL)Alswzu!~}s^Qjc!qK@7;~x9TZ(qg6 z6bBQm>J)c4*FL~@+wN+X@?Ui&cF*060+A(r=3F}zBR&E@I$+cjjwsesjbe$8Rutk60PZs!y&3I)6BV;>7F}l!xxD= z{rzFM)7#=k_pxI08Eo${E!d^8pmt+++G^Gt4j@e5MRUG!-MKBg^!OP~?f1&(&bKhX z&u1SdWBi-G|8Lr_@P~azLFx(EC5!?eLI3>FM=d>Zzt>@DXgwyF4;I#T>ddP4I0-yX zdhF4w)2uD>w^qz5%xEqSAuBRmO^XcEv>eq^sS|{eK|@V*-N*_xyVECik8_I=>?-Qnof6le2E;5JB? zwBo>8986LYUX&zMAxjW|wc?eQsyVu9+6qtizCB*qlG@&RaKLhYki|sy<{0;gmCO&( zNL_FCem8gfLdWwt>T{iM!PBQL$YGkLr*H{9;|{weTXEwVIh#NII!!Me*c?+6piljx zzgF^$)NjwmPz^CQfrctjB*}tls@v3>OJOG-)vdV6t#a|EE@fan8Ca5Ua;txLE4zIc zFn_Cky29zyn8Yba%2L?emYuro_oxNG?ky>#q6P0$cid32go`ZU zB1^c)5-zfYi!9;tEK9h^6aS7U{y{dx9qN%st$EW7(eFy?%-R9>Wx;XQ0?<2z%^w#7 z85c*F$kG5ther#taFU>+I(SJ7!Puv8Npnr#vF*qWG{taT&2)X=v4Xj%xV%?NZ}Q>J!YOu*cTb@M24K3j?El}U zA8!mi>)1x$P#sk_w}Yx!29-eErgN_7p6+^v(=$wKB~}9Oe&{gjrCHbuX(^W!)`4?`0IHQGcch;GecJ~l*n?F(6LknezRRh z)n-MbO~Z1cNDo5GfOEz$R7JC0M^s4KENVwe z+)m>)-m?_$$;mxDE^%BeEa<2rD)Da!YQ+7cv%@E3Y~7k$40aZ*u!_@krdiieXu79gvQ#Yo_CL>L8uxk`5MT z^eYvKFTikHu@&7>6<DNt=%|c0}`h7|xeF@anTo@$vK7v3L9n{Qcm4KJwtEiyOFG-@w~(Lcb#RwqAl-?m!#90(G{L zy?dD@uQzwM#RqgJoR{cBH9b}HG`;6;3+DA2_ttL}YKwwte#tNbK(At&Vh|r%q<>Jd z_uJ|I(OZn;UdIVs&te1v#E9h>(yv)g*d$vcj0=QH3CIQ|0WAe?Do(M40S5hth`8Xe zsE~uG2y71@o_#9>isXQoI3Rxc#~(Bg$=)xpdolz(vFt4w(1N2oQs5vHSg&A!3+J$w z;9q6TyF@4lQYV;#i4a-9K&lu__=ZGfT0JTdUezt|luubQ#m|fog;I6PFFBK?!&$>B zY!2LZ!W%;2t`lMme46*nI}l&0BZ37*iXsbQtf(lcJpp#%A}$#p1;0J!&Lo6x&5;9f zWpoVvLI*q?gM1;snbS#5)?CpRJA>_`bm4z0VWaquECV&r!!SIE42fW!0}djhVW-(y zmq+Ii&N4rRYsMiUuxCaEOr%GC;f#eQXXoA1gEL@$l2kIn+X~bNE)c6TT4&GD6r1Z%sf4adGCbA59bZ8-@ri;IuA&|H{6WX*sMdAEaW8EWN4l zYr`_l_-CjAc;Om>G8)1~lZa9t6|HZRiy6ts@gE#RhTDFC`v@d)&+V(KSeZZq)msj4 zvH;j78rrjlVz&v^fz<%Yq!dKD&MMv^k`E@dFhkyZ5f=!03i(-pk_OTc7ehhjxTpkj zvY)~jR>6e{)O#@Qk&`kfG(^`s2TNw8IyMm1m@q{7fNF_T&?X$QR#3!qp@0WT$w72} zPGiJxB2zjffvzi@SOE=L$^~i6QB4y8;53{HwV#0i_&1fv@^9i~tHpvk07Y0|G>ndl zy4MC;Sy5L)Qw)qtXp?MDc{r_kI8CP1;zEI^OaM72Jyzv5g!iRcZlnHlt@H-(WHg-- zJo^x*81&{J;RNnof`es65J#6KoKQI0v;&D0dH_;r!*ROKePSDrg)w?XyL{R8Zj(5p zS47TT#2TGe_KYx!;CvOoVD1M3^ufp(+u5N4#wBmME=UmmXs+p;87_`F5oS%jxFk^b~3pN-v8Zgr}9N+3XOt-s+qlI0^ z3GHrZxRI?Ix}ipjgtWYZEX4_g|sSXVd)qfbI zMWP&D5z4_;EK6~lrAQ>op;m9J2(nRG^nz^Ik5kMj>toVq9`>N^|1E;i&yJ~Tx~kcl zW&8ybYg3jVz5Chuf4_dnF2Dc%j(CY05<&*kUqHAOy7N*W{w7`-d#c%8&wh~+^_Ps7 zkZzcbyJ0H3ti>#oQTE-Bvd>>y?-yk^3{TT~rYXbhGR!W+?CXZv|9`0|sxD*gGS)6* z?K0N>Y)+_$#@c15{iTH3@8unR^pJ!M#6Q{%PzK^aFUG;}qSoE*-Ws&$sRR_H-3eST9EqE_{heto8Ia?BWH!wCMifMmqFEUuJZ~H zP|PG`iWy+yMviKaGu*=MqyCz`n1uo>fh%5*EC$x6%W$oQ#+dgm^M< zhWwM%T1u@iCm%k4Jp5VJemeT}=`;NB)2H+Ov-8va_xLk>cKq{{jz66roqss|xh=C# zdU+n=9Ch}cLo0T|; zhLmW?XO4zc71hwJj;tb7hmCQYX@l=d;wDH9TPD@-GdI#@d8cji^$Edcg2l3m(}p;? zlPJ&ioXV2rD_JhzmGnYDhb=y!`&4mKJ;U@Ar>Co~tl}iAI7zsigv&{|oP^8C0!}aP zaGFd>zAQ;X`!*k_>)a=}(F6j@``s4rcd|y4tkJZakfSF;qwNsf=%to#;u)zzNfbO5 zbFzQb*Ln(8UAhQ^{xXP*qINaznoumy)O)7pw$^l-Vq7qwg#B|!OWdO~$4m>VMdh`k zG!MErjkQZURU`DPzP-?hY-UlTszsrxhPw@1xfS#P=Q!AA6F|Su zn5k2$t6`u8zHjPIpeljpTV_N}M{9cg-kt3KN4N5g74z7KM&wWTL$nMuU)QZDRG4qL zOjE-!a3U}nf-PVlc1hJp+(Y9~ z_Wgom5r{-Ty>#@Ge21NcKuHMH%YZ<=l4I)8;7``Z@lV`RDhAbsMKb)3cF8fKA{aRZ zDnwZlXZ)$el(7}Mm^fO892EsoJLGgk)UHML z@XGX7dLg#vh1eu6@0E~JV)7&=@8x3hq#k&%9vBUSg@vD%{ztfR9pJqbcH_~zP5SRo zKOQ)?WtnQ*yJ#c_@58}(5${m3aI>t_O~ue$&3)V=Zi>c}Zw*hkJXPr_rn$0+8-i)U z*dMYm5c1ng;{m=IRUlp~+-qYwhrS>rTbZxSV6h7#$1pcs*5`Zm=?>8;rgZ>WzsSSw2F#l<9A6a}syGS^iiO>s>pP~AvT zEnBf2O;=4l5+QjBYkd^}sY$F%9t$s=(T4%I_$8xXI-`$x1V-}uHSqesGs);z!RU7a zW>82@zjRKYR{l3u#P{L01h2m)@VcehaEI&2SI$q6?%RFo`~>N~_1DhNkFpEzotNnO zDsvpG4UX7o7}O)lMZe}_t%2p`Vg2C4T6)W$KL?-2TQ0LyWR{A|QhB;8l?}!-Ix!{xFGSWdHS&(lJMGUjW#VH>)lnu_L{hGqD^;y6@sWwy^2vVEk#M&|SE4k=|m z&zZ7fx4m&lSy66;IJxtBOO1BQP6#tjv39(!VaST6uNtn$N3ncqJj%Gfhvk*R3U9 z>%Hz=F8Yj_uHpDr*I~NdH5@JMI!r62X zg*u^2O-rR#U>K1RvPQ(B^=0Zn<#Z@9VvMM~z zeKRj4^y@W2zZ64P&H1{euM+z8D9A4f{Mvco*9LTVXR%aj{j6na|LzB34Ggv!Ov-Dw z)SmM7-q4+w`tUdL%GguQ?s_)S0*XxX@a2JrUOP`0JWi^!7DDLZI(rwc8qW#-gRj=M zd4*XGZkd0b9NR14KT@`97@w9acoZYV$8c3gwRN>W_0C<;G`yI2=Pt77qVmpN2=Cm* z%sY3%y>k~+@7x7?=Pst+Ir!-Uy>l1BJJ%b9(Hbx!!DtP*Ye`KHz6OPm&9<%%5QpQ zi4H)RX+NYnTY|!Yj!n$v(-qG&)3nk1}Wx2Pmi>jfdTKA#F&gKmJTin$a?WSZMxD_ZQYVh z5|AYUS38s=@DhZ}~v@=D5spiMuIWX0&p{&k*LO2pH^&q?y{L>@HA3l=3k1Rx4e=EHr z5+t=PkkqI1{j>Aa{r3_u^=Mum37C?AsV4_aiRWNkKMNUI2PO6RuES34WirptP2U3S z+Evif3i6^;sgl*kv}&I(*9yGBN8aZG_ZsKMW!BjW2LTu$RKhWq*mgc>fE_jy^97}ATfMGV;7vz>%~37C^O z;#Oe2R-?RrJ^8e-_+Mj|gxpQX4fB9;!{nR=1MzBM(qz(8%7LPjVFgLUISv?2#sgA_ zY2WLGsZuDOzfc&2UUFx%FrT?uje!%cDWr6~cIg16u_z8w8eXR~=pYU9*pQ+i6oaD5 zlR^qaO*eoo5h8Lq8)1S4MZ`f>AX?^{tetD3nC5HBH8CC2aBWp}RHw66uE{iOgw%6N zHlAuC5=Tfar(~m8)*K~12Rsg*(0Q$Te9htlUfD|nWmR)IOT*Vk&sz( z$qTL0yZP6%Cvm{iLOALdV4>VW8(%Sjf-3kcIjb)eegCF!^py6Tku2tISGc$5fhIpH zcU%>c5nlc>jqg*-z3SKV744#3%H6)0ezT&w&9Z` z;U1(jrC4r6Z%V=3kh&a#U>4HIl#22x`&&>vJ-gft8`{;`J9o?4C&0+ z3Cc=mhID4^G;yVk{s=l#iluaB?2uqeXNFXiPcM(p+?lbyf8xG5CmrdaxSNAw=3{E9 z`lZ-iO+Qx*Pc{YnP!(%t^x{zUYDg{7r8~#FY86Ce S(Trx%jQ<63B6;BU&H(_01)C@U diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz new file mode 100644 index 0000000000000000000000000000000000000000..89ed9671e145d630b3093b97cd8b5c86065e1fbc GIT binary patch literal 1488 zcmV;>1uyy^iwFo9Lv~{T18{X>Y-wX*bZKvHUoimfSZ#0HHW2^({z^ zC0!SHNneH)MLq@Y{zo!04;(o!>Vs0@5qPu^6-v#4wsdNKedy5 z`h4JIOtZ5pmrio$h-xkxkrv=2IV&1CaB3yzP_c>w<66kdNb!O-jmVtDX%>wrYpHb7 z%{HK&gKoZJC9dL}GBt-rLx$rGpFs&$F{?l)_iG<_+H0Am@*b(L2aZ;4gZLeKI6@^+ z<)9Squ|l0e0k~(nurq+VPJ$|cnhF}(Dh_&4L;o<8Qf~|WvC@L0HFjWc(?W1&09~z? z+zo%1AmXUaTA%KmKU!9ZT!89>$)woA@=8pFW(Ap~*T6M7%b?Po;tq5oyD|sdW6||2 z;+V6dH8UPP^gKE`x^DCzaOhJ%I2?7K^A)4Sgp3kttEK9~CPl9@&Y@|Pn9g(`{p9WV zd<*`hhKd|pg&lyqm99#e?ZMm=ux>11&MJ-33y;~6D_~vpZGyQU-3DT^;Y4V-vJ3A* z(wEqB#kb9Ni1Z#zQlM%&nEEw$tU|?9&1aT#6XIrAvzfBMGux$J z%oy{sETTi-I=Hp@(giJATDP^aix#B`;IeiSFOI{QQa_~rAUtqbt7kUI@0K}=dU;zE zt2AW!BFo6+7Ap4~ZadOy)hfz2hkF|MuUK`1+ZofWvj6`=zIF;#_auc>961T2Boozi z7jaxb156=2MOc-QlY2HI7$8qVfJpz;HFtm;2VU$(ejM2-ei|2XlNh(GxJe9c>xF`; zXtY}V;~w*#(q+%dRsA%5Et^@($c0Qn=?49>nWdYfJ^6K#`ahENStn`WhT$l1N1-2Y zd;S+>_iJ(r*BR?MeKoQvw)S`M!{5)^-#+>~9J(VKgg;36d6RO_RsGaFuC%O#B&V&? zER+58T#xmI*C`s>GO)s;%~}HU6zfd1i?P^N8%MWH@c#Vg^U25a9p_Xq7jS!py#y4y z&)S#3^Uh>ry<&DW!^kQJF7Mp2ZH_6eFZEwUJ zxGYNywnM3=M@#YMC|w+dhxDj`DJwX(Q)aDJ6=>15J{H+l=aHW9^M9^`HC4aNO&?h= zL+Ea2XY`;kdg8vw`TnliY~TG{$W~jAFH$9T@zOUp)T_+;ci%I9du_OEG(2;4$8sL^aHTUL6cvKc>IkGtYPC zBzjvG2CT{NrZ#IETF2ccRUs-HqER$-!zeVN0ak(<+n2zQ2+h~fdE=WH45OfU^jWNA7{r}a->|)jy>Cx_ft`;}oyZ0~|8)8n zxxf>Ub7+xQrK)FwleaTzu6a%_YjJ~hrk%x*JqLX&)eZS5^o%faiBkoT4`7Ig748X} qW4%T$TZQ%jcU%gsjrJSXV2ShM!^!3N%}X@e*S`TS+)w`P761UD_u|6< literal 0 HcmV?d00001 diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/entities/publication/publication_0.gz b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/root/entities/publication/publication_0.gz new file mode 100644 index 0000000000000000000000000000000000000000..85706bc31f8d9816f211cb79173864c530081e30 GIT binary patch literal 10874 zcmXweQ*{_o!VQmb}#b@%G0-qlSU z0|T@5;bsB??&xN0Yh_~OYUN-La^?5#vc;A3eY4jGmRh+ZYyAWQq&hXP<7-U@uGTf> zWPI-KrodAO$H0nPOD)7(*nhp)fTYO%HJWNQmBlJeiegalbmhd0yH@1<^5mrcq+s-> z4>&Cl+|^$yHoaZq&HQHV^?$K|?}0y0Bq~;>${jm*(;N05+>5eQVfHjiv{<7rvFf*E zj%`xw#-V7YK1}3P-^OwZNIVu*6kuAvCG?f={=L|HcFj6dr~T8EG(ZJ{f7RlC%4jlt zS09&JGGL8n7^WJud8v?aY>YH$P^UlnS z2T@IZ53J+ReL@MM?`mO;NXyZAp(+-!NSET<6;#sTh!pKLON% zaicu=NIH%=%AtE$eew5RTjII%tS=(I@6;I|V+j)>OFYzA)#n8zI|(H}Gxo$sw4HTN zljGGt`wJ2w9X-MkE(RY%dVfD$zkV6rfi7eh5LMs)!3rWf56KNl7jY*XLK9O?D>6zb z0BMG!hAJ>n!YW8fy|qt6_I3({>WNil7s{7t>}G23-#WAba6)A|j=zyC8cobIA0du% z%S;JOH8lqIMsuBR1_xO5M<5eoo6U%PIKWe4;S@Tqg_y?z5(nLZhgMK{pKUEY-32C4-?WtgY zk}oiRqVVN^u!kteSAe>-U#K;cj#TzS;X#;n=mo`jjf#>&>ahs%lk?zE)C+~4^`jLX zR>8g-nB=NWEC3mFd@u6BjWm^K|E7^(XcA~UaYE*m&>!ZA*W={x$n-r~672@|+DLj$ zFD3@ydxQ2CL&{;73MB(QHWuDrn~$1;M>fLaw+w};Q0r$RBv|L=g8(9F&e49B@{B7l zc>{OxQ9b@#Q|D*>&{xm z5TZnU%KkOi<`WO3QN431S;OvRR0Q;nLm5o}=tDXHsAVm<(yHNpSRs5qwyl;pr6BbC z`@3Pkqb%9M#W8_~e?R0q8NS-vMW0M!W@EzhYjUj{EEay&DJQIq94l8d=2pu(=tu@$ZPCX- z`bH#8*bK)v(X^tvMzHmTi-PDL>LgkmFrv!30j`Vm$j5Wn7o1-Zf`SqJaX+0ga!{hh z^#W|SRZSKTHV`>!({!%j%cHU&ueFFV0ek!YLVDy1XNTKN0=P`hEMJ! z_&oLMR1_iLnU2yUpDY>C*Zj4-Vn~5W!!lwW*#32I&AF&wyO{pN7#8F-HWXHTN(b~1<6sw_wEZkNT)ojSE*ypgjq+J-6&})wV!yw zGLh0*^Qd&D=kSVU$D;H?tF#W1V&08)xh0W>!ML?9LqfFsMaULx-M4tkM1Na=Jcs2X z6e&7X4$&mS!N~=RM9+UQ-uEn2;9*6Ot_-vsmzj)fiKres)QzQDL)ruo_v4`ka$yX7 zG}yC@)K0~ae*R`LKa;oDxxbH(zjO1n3ODQFb{(&A8E5BHr8VU)4r`152)wE(Te!0; zL6f_2(hIWmsX9yt3S+34J!EgU;=Zzap!j>sx@pYoAJer7x|v%9E-3 zX6b`AL{zwi;`BnH!m`SU!4Q#O=I2o6iS9T!4=4;s>1*@;vOu1(%*wsl1N{ecGSQcT z=-~4^ec$0}AUxLN;$O^10cxIiY+@oUBpbdObbmm&7lOCpdcW!LLFf8DtRElu%e?)u z>2Joi&59lvV|x@#YkwR5K2_2rpOgw z2c?;ZQmXxW@pKpM(62yiSVg2muY`%pmgww8A24Ip9ZdXeUjzorhmj7DXxA2hJ~LSV zGN0C^+o#`)*qf(r8 zM}GcOehOR>LAM#^6s|-x7b7j|#S}NUdeDHSpZouG*egga3ZqsYH(o9+$pCvM91J}E5t(N!;J{=CNuq+4EqS| z)6$5yN)c>Qo-o#fN<=}5xIroqJE<*%ZiOdN<_dbn6q1B z+(fOu5m&zu1r%zp;Toc?%h$si8p#8z`GReQd>-InVe}VX*Vxtg; ze_YYiMW_(ifAZzl9PLUO`q(?h1}2eVuiwgTE!_t6Rbi+N)og_p!4p#PJs0iuPKMEN zQUO+IejSLsgKXJ#)o26r+$f)iSEeD!WQWJY+^L}op&Q(U>VAN!(+aT2m)QZr6G^G| z27_7sB;e!Od^wSf4Vv`Kb0hbRR$~+X#_rg09u8#37J~1)^3hWoc-GPt8r_>X1F+&`4H70?onJn zrk@@t5J%(lk3`Sh82T_-4a~+t&TyEIIqbBI@+OwvG;{B9UGrh+d%wn(Zk$fapBjW{ zPMPibvZYAL=48JCm&`W2lKY2oL0e--<0d|pX?A+TA#b5=iyRq2TZu*6MgpNsvhWZN zv$C?{>mTKK z{+dOS^!pIsdyRQU{i?swOXNXwhwbT?obO$K>_{W(*t@7_v*)McBc`veS>~maD~&$VcDB^JLv+1ak%xTx%-7TT%qkDl?E$VzY4g#);c{}$2M`Nw<9h~ zusRvC63EG*?WRzuk;c5BQzq-P zsjOwy2+r;ZtlXN$zlj~u-9B2WIs?qlX$VB-LWk^URrI_iX74@vG?w6@ImXr@&Q$u& z7*qI)9=AquDoGRM9YryzlS~J=R_ttNO6t*&|7vWXsd4+MXtLA;5T++Mz2^J6l2&?= z0DbR%or~woIKwnjlX~mh(K($>bIPjzJJ2dpX6R7Q2u&tXW&E<1nByj1MXY!#M#-86ZsZ21#ew{l;3!Ip^ySIy2ATdQBkFm7UzG4HgN392|+sK8xGNizrQ&@^5jQnX!R^a*ua-*aqjknBb^1E^O$@F5CwFz z-Bz36>xXcOacYKC0-#g(+vQKeE^@EE!# zk8%h3v1f_oIQS=C$k@B{Dq|709VFD)K4AAer$NniYVpV+!niXBBHoIv?<$2dph0+3 zaBVz1N8ww;#vRt?*)5t8)vn=>OK-uDHG>H7!V8tvQzXJXoRP!Pl$*##7%+*CE)!mWyrjRLMIc-FVn z$zg26_Bgw0UWj+RR(5{lP9orPbm4rswsmm^62-pl_CAc*$f~avn-b}udFJmtNJ3Zm z5J2q`R$3gWOq<=>-gTI?yfEaQVgt@7zU+Ji1Qy;G)W$WmfxWDoT&~ z+L@B~3Zzaz7mK(ro6u)Wa9~gLW-18!hkkG1))Qnm-j7GD29Fa`%TZj*3Ca9b+D4DM zMpGXb4+~%*QP@M09m*1UX4yHRhdOheot|zU-7*&B(ns_i3`kGB)no|pR^p9U{6;YE zn!tD8X*-K6=tz5d0OaSG^W*s%y`*hQ4eBUlgpBk!lsG5@r9G{J7c4l@KOue{7*|O+0#mdL%tzobWHlwQ zCRRL|U~-@WJ$68hK)YY45(fc@C2pp}Ps`O^seh|4U^amx$huFRQmO4Ujw?_2jHhAb zvfqrpO9p3=lQ`Bt<~3Ic`dY$6rlnF$-AHnRiAmWa+0Q5tMMvd*utx-EffivepBQp9 zd4{qX1|`N7{s8I6F&08F(^1IafRKi4KlBY_90Giy2WOu@F!Dsf#Tel80TUA&(r9XE zjhuDjn)-uc4+sqG&&0)GPDFZ>X2vq$?y}6g&INr)E>1$ds}=LRjdG?)#5o!UCJ=5| z#kfHDgC%7X-sI!!r~w&3X@rH77L+&N@+BX1N$~2Py+uC@D;MbAsBkdJVMOB6vhj?y z7&g4}f)a0#erv%%;yJEIdEy5lp&im23TFO`xUuBTflHkqpLFj!rDlB`eu>ef7UyGb#R$h1`?Pw=|W&YAP90xYu+_^y{0e7#^$SkOS3q5hZ zw`qgeCNph%FioydsYBO6DN&VBEXr%##go4p3aOG~)L=Ue2rCw>!?za)u?>s=O5$du z4I-#1V-DAbF#>xcLf}0so<>FNxX=%zWL2}nCs+Y6$KQjlpq1szLm1cO6U>c3@=3{n z7CyM(xqMJr$iP$1yN03D?Te?u3FE=3ElGhuAdZP#S&)Ey*- zd7nmivcfkzE9%1{;`1>U#5MA-oHXyBu;DEtV8ij5p^bA}H_*0`_At#j<3HuulyHiX zhulT(Zy06Y=eGePk@qJkxwJGOoLtohkw$#*Ir9@m=7iuFxE0pCWo(a1m?dGN1#q}~pc9A(g}PboCC3t_hnk%4T!nnL4 z=cCW9K3H<&tMIG9YTE4%^-yv&*;$_`Lo~E9Vs;!s%;li{510_}n3$K?3HGdc<=VEw zi^A;2V6WUQ+keX=?2w>pB&IQ}6B%3){XFdEDIW2NG#3jMp`${osC@-&v|?!u7ZeD8 zoC@n;Od7=~Au+bnUvkL)rcM#9+0{Tb4QV`s(C))8<14D3N)~vi2fz7}L`Cp-l`A|K zYhs@Q(kWKUYx~!~6g_+k5}9l6ecwrSiy^`m1fM&QY{SiXRUikT*TwQ1w_-Q_QWN`R zIHF9!H$S%rOfqH}rBzVBze#TVK(0}YX~YjNvm#tlGjhi?*ED(-AG-WjwFaEguGSh~ z{Rj9>qWqkzwIWvzl*?ZfH)k;s6lx@2BMO1ED0N_WRmj6k7is;Pv@{RP@Wft!jtivT ztMmbl(VjjG7>TCBr?g$y&rS7b1{V)xcX5?SZ`IV@Fi_L|K6ku)ouo4x+2dUkD{2 zj5LaE(lC|fIpgMw0l3O2;%N!&(KiXr_7(A-Ks04_LZOivPG0+K!)pB2EuAyC1bq_t zy1iWy!6h*+j2|~^I6Zf@?#*`HUE`EQ5%ga?e|qDF=;b#!ITj?J+IPjTM#O7xK}KRl z!u`SCt=2^(^J!Qg;osO6561gbok z{j01uJL}H#6kptLwu6$lzr0YOo=O$*nKZ7p3O~Jz4$c+N_UoW;G9}cXI-7pJkTMO> zTF$g5dZx`#$udk=S&+^mCiIz#;-baiqS@f?{o07TcZm1HSGhIZdVNWWhMHRFzi_13 zVG|1T6GoU@=Bxg+Yld9|#gu8an0oHX>Lx}uLbhoBwRhT8tn!^M^d?*I#dI}GVykE2 zhipC4J0B>UR)$NMqXnO@fl-G+ZzgLuSNwAT#U?WJ>FKZi2;x~L_gO{-4}rV9gQg?mG1rfS!~H|Y?=kHRzsacw z0;V}WzLnM`STSpFqlNv6+)920OiGL4MP&vH{sIq7|L z(n;<)11R&t@vG4o%#x*k^2DpW=%2T#$oXa5JZ`;R9aWebVRX3}t* zZ68AV7c>KLR47V0HyWP3s_dC+aive81U+bYc5(X>4lTaP<5HT|Zy_S---hhIWEhe$mx)6+sND9Us)5BXk8TJ6QE zE23@ITkvt?=B39jOn`&V+MD(I8Y_RX7HT%>_TBzeXkp~~gb+Ipu*dh<44x7mSa610 zKUEK~VDOo?>B!9Z)kok}&gF+i*L@sj@_K~CAH^3@dtA%jT3zMH42O0(aw8lrQ~<&N zTs`tdu{FkZK|@89C7+-S{g22ZuAbIx0abec3wkWZaG{!G&bhO16)yHL)h_HZb1lY( zEUp&w&nBk@el?~EPJK9etV0{Mz2ufhrJCv8$F*yQI?bx&h`%Mz#m61XZ{9~bUgZYT z4jYiq)=&5I?SNfX_{$Z1EVui4{m*VI5nX)MO8b^J9GOd_hxT>9+jgLh%>?A-vP6}V zV)V`uk&_&jFdxI*_kv1)86{ZWi6;~>q;KUWb3Q0CKH-d`288cs>r=}SHdss`#@l&K zQQk74I~-UEiIM|2SPSLWa?C(pb;(g)Xo=cmC3uW6ko+piBtF{k45J*yOgcNN3{G|Q z+Uq6?&^Q(`p{p1L&uL_0EzajHK26K;;sQk#zbXPP9wY^ygM?zGd&V!w;-_Q~nd(KF z7B#hnra~*1JNTzZUoLkMd~Ovy#Gje{ZfQ|47=4XVFn!RPZ(XoE^MF_QHEmJrJ{KkEaM(3_{}08{hMDFT?mzgV|eg5CLxFylwV;Y4UqC z8c4IPvHxTTR;QbZXwjb5>Fgis`s=FXfxP&AvMwj@(v0vrNAv8J)+JjLpP*h?uv{`J z#(ri@2Yjk(=4~UZB=5Q{G{FVaZuPLYr_3nxo_?hY02cNmzuyFeWrb)EBm^j zZvIIrXpP}JPR))u7v5)Gl_3=uB4NKi1YHFz792@_PL35gC%~PjK#~F)xXZVq> zUBtQjmSc1M6<5Iy5+Y|~Q~^H4paThb$S8Le3#AD0h50^^XQ(u&0=;v&8KwnnC^9yY zoeYbWN%i#jcqERzkbliAQ?y)xc_E`hSHPs!$_ToP%bC1t@Q9E7K}5K(74|I*F$a+B zCfkPg_Cz+$1o@%kv%iMs>S5EVX6;vJX7zH1Z;Xi0N+37cs%u^6)x*hlS|acv-_7zI zxnVF(l_~O@hNzeScE0t&b@E6oI}Al+Zf)bjae18rh}@^Mg@tL(&Smyx*KD^3{;k}o zKLniS3YPpC0mNZ+imXY#Ug!0YUb=PulhZX_I5o-ZLDS=s?y=h;dyvyb;B(5@;c@MJ znG=%tKceD&$`it@10q3zTLYSIhnz*2MIT6}Qmvsx3W=%mf1%)1IBa>8#mA%|WMJI% zBL2n6?G@-fAeM^`ZpAh! zoL!K|*BN+h`7K%a>FDVH1?j*4P4vxGx`T`jjUDkWC_uPaOwGR)!-m|}eD$cZPXVdm z?XzFJ7PuP-*7om0L)alVC3>X7;CD^JLDGJ;E2HlKN0Hb>w3Hwglgp}smP`(HyTD2nvF;%)UQKT@xdHwF z`keW3-@@3tD6~noHaR=e2(4zFS}mnYG>gk}&kT zG6K*?-sGmHs42@+T}ZkuQ>Gkq@`B*&Ew%Y2KI2n4GO5)Z;p*?cy&5Q2g83N3yf6E| zZg98C;cV7tO`p~>po62ZR}GuN4cLLX1MzuvrqwGFuTuGV?vXc^zirNO-mzK$MAq>q zOG-b292!pCRJwfYrnM<(Sn8xBpyIC$XI9y!Rw4<3IPu(>=v<&BwZoM=;_OSHJ)dDz z$(+7J_r&_Qi1KI=62Gsr%>U@d+u^2(&&=P7O0v2RImtzFmk;c5p#BSn5ldXL2i*c7 z1>qsbs*E;C`rgVMaiCeAt!eX2ur5iu+x%K}@5%g1?+Dq?=}BhzgD z;J+P|QwrP_W{LSu-V;fx3SwxmowAHxMe5$^0sKB_(DV$&MgDMDv;@s}?8(&SB6<_r z|Nl?O;xyK@!SjXa<(XpsPr@XjW|(PldY{};*|Kxp=F^8KGhwW?fbNvv;m=l4N__Fe zV|k&DjRv;M%eI{#m(99TLO;h!!8iYa3Mh0 z=1l#bZP@J&Eow)bKZQM`_e~<(ZgMR}n<_bzrv%8Vr2C;vi{NI+DEF4~-HXmc6(ySw z%H{?99!NGKo14i#9!l)XF)N|BcT1&ga>8>UsYt8au5ovP5Z>y`Iq%<3orCa9aSN{A zBaAZ}l|iYfo6Kl9^poIbO<^@*psvcs{a3FJt8=`;pUF|C{VODbE3$F_?bgd@-_B~{ z^gXQVqM$P=e8&Ynl9OO1vBG5A#{LP0PxMm~bL;}MZ`k7pylh;@{mrlQ`dcx^^F5Cb zkrdw77yB#G*TMZ88+})4q$H*sE}#zsYfGl|lhBHZ3g>o3p43S<=*V*UYnY*J?rwhdW(O4~uo&iyr`kd+Vr3ai8;ZmO!YWsqJn7){!S-RxJo3RU`3S&bbrXX z)6nlg*7Ch~yPOSFl%8R36S1&OHP=muykyU8aW5joqph(iw2v8D36%cOcgHekBi?Wp zz;Tj$aEriQh6NCcNtnK7mSnUZLm-}RS!6c9(1LNw_L>&ZzZ7QFo_O)Vme=45=UWYv z_Qu8a=PyTz*c$J$2?Qrw8uWRZBTpnc?3r>w4>O#aSbU{T7xXKYmxPsw;*6{D+3w9a!IyjiU z`4t6mN8JpLlyObUG{{>W@=KI zLqGMfl&Nt0JGU|8tK&Kp+wPTfs}k|g9X<@RnfD=YM0w2osVeW+AT-M$P3)?G8;lI zMfh!~%GCujP{j_|0V%HiI)tjX?xp-HO44GC2%-tnUZ<<;+#7yaTCC${{Nts_f&^d7 zzlYr(MKKTN&|U5ke=S-4b;C2Z;&kb3o;+oGeQAz*oI=|0@Ufdkn0A2n9Dzi03oMk|*qgrEPE`I3K|g60nS zl<1!BeJCy#>djJW&MM*ah#AAsB1FiZTpPk8XdgT1r?n9_W3a^&RbL&XQq?eHs1{7% zptKq2?5{2BA|fS}B%hnsDF<_+)iw$;h0NM$wR#Ulje84nxWIbSx6YElc*Sb4Vr6D$ zjphE~aFC>xHmR$`MU~o?FD87GWb#&FUzsk?5lTlJ5cwEuH?!84_| z3Bfe5ZL4^}C(P<^|C28T$y_TA>PDlRcp)&f!;JNRMPNi@<|1nN7Z2mVws~>lpA%-0-S=A5M23vpm;jvz zTN>pyRBmKTl)V*UJgWeCg2Ibp)KZ=#+2*_vZcd%WpCsb7PQ6@J)No_f@p8ESPkZAJ zJU?E9$^Xtmg@svHY(rDR(Y^>f=~IKGzULPU9Shk0yiva%dcHUKM{$SLuR|vNJUh0b zw-(to9V)gX#`ZRD2fqr_WRll`c9JF1-Rfyf2c;3|Yd>s`I`_(Bqc@x~$IR4(@H^X0 z2JcRfMS*(eMUXXr`^P0?duL$klA1g%4RH<=Pup=?LjhXFE+<-|HN-)|z`- zj9Hmub!ufLFU;r*5zz8(I7&#`I^!6xx_jEz^p}5h3a9p-R?T2!>R6o%uwJ{Q;)R(o zTjfburz~rxY5Q?f)JV5WJC&EGe`l>@W|#enpK9`x*ZE;xmo6VsgwI%rm9SccUM+gW z-BGGUtJ^SoWH3lP)wn=~el;_ic0{B^%N8O`J$eLhvQef%i3ZCdjrc#cnrr0n8w4Yn L#g!=;4CMa+cETas literal 0 HcmV?d00001