merge branch with master

2021-08-05 11:34:20 +02:00 · 2021-08-05 11:34:20 +02:00 · ee13da9258
parent 35e395eae8 74afe43c3a
commit ee13da9258
18 changed files with 523 additions and 94 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,2 @@
 # dnet-hadoop
-Dnet-hadoop is a tool for
+Dnet-hadoop is the project that defines all the OOZIE workflows for the construction, processing, and provisioning of the OpenAIRE Graph.
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -25,6 +25,11 @@
 			<groupId>com.github.sisyphsu</groupId>
 			<artifactId>dateparser</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>me.xuender</groupId>
+			<artifactId>unidecode</artifactId>
+		</dependency>
+
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-core_2.11</artifactId>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeParseException;
 import java.util.*;
 import java.util.function.Function;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.commons.lang3.StringUtils;
-import org.jetbrains.annotations.NotNull;

 import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;

 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
+import me.xuender.unidecode.Unidecode;

 public class GraphCleaningFunctions extends CleaningFunctions {

@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 							.filter(Objects::nonNull)
 							.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
 							.filter(
-								sp -> sp
-									.getValue()
-									.toLowerCase()
-									.replaceAll(TITLE_FILTER_REGEX, "")
-									.length() > TITLE_FILTER_RESIDUAL_LENGTH)
+								sp -> {
+									final String title = sp
+										.getValue()
+										.toLowerCase();
+									final String residual = Unidecode
+										.decode(title)
+										.replaceAll(TITLE_FILTER_REGEX, "");
+									return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
+								})
 							.map(GraphCleaningFunctions::cleanValue)
 							.collect(Collectors.toList()));
 			}
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
 import static org.junit.jupiter.api.Assertions.*;

 import java.io.IOException;
-import java.time.LocalDate;
-import java.time.format.DateTimeFormatter;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Locale;
-import java.util.Optional;
 import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import me.xuender.unidecode.Unidecode;

 public class OafMapperUtilsTest {

 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
 		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

+	/**
+	 * Pins the ASCII transliteration returned by {@code Unidecode.decode} for a sample of titles written in
+	 * non-Latin scripts; the last assertion checks that an ASCII-only string is returned unchanged.
+	 */
+	@Test
+	public void testUnidecode() {
+
+		assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ"));
+		assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛"));
+		assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼"));
+		assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい"));
+		assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի"));
+		assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики"));
+		assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ"));
+		assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης"));
+		assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية"));
+		assertEquals("abc def ghi", Unidecode.decode("abc def ghi"));
+	}
+
 	@Test
 	public void testDateValidation() {

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java
@ -0,0 +1,127 @@
+
+package eu.dnetlib.dhp.oa.graph.raw;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.FileNotFoundException;
+import java.util.Objects;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import scala.Tuple2;
+
+/**
+ * Spark application that rewrites the source/target identifiers of the relations stored under
+ * {@code <graphBasePath>/relation}, according to an old -> new identifier mapping dataset.
+ */
+public class PatchRelationsApplication {
+
+	private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class);
+
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	/**
+	 * Entry point: parses the command line against the parameter specification loaded from the classpath resource
+	 * {@code patch_relations_parameters.json} and runs the patching procedure within a Spark session.
+	 *
+	 * @param args command line arguments (isSparkSessionManaged, graphBasePath, workingDir, idMappingPath)
+	 * @throws Exception a FileNotFoundException when the parameter specification resource is missing, or any
+	 *                   exception propagated by the argument parsing or by the Spark job
+	 */
+	public static void main(final String[] args) throws Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					Optional
+						.ofNullable(
+							PatchRelationsApplication.class
+								.getResourceAsStream(
+									"/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json"))
+						.orElseThrow(FileNotFoundException::new)));
+		parser.parseArgument(args);
+
+		// defaults to TRUE when the argument is not provided
+		final Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String graphBasePath = parser.get("graphBasePath");
+		log.info("graphBasePath: {}", graphBasePath);
+
+		final String workingDir = parser.get("workingDir");
+		log.info("workingDir: {}", workingDir);
+
+		final String idMappingPath = parser.get("idMappingPath");
+		log.info("idMappingPath: {}", idMappingPath);
+
+		final SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath));
+	}
+
+	/**
+	 * Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the
+	 * mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location.
+	 *
+	 * @param spark the SparkSession
+	 * @param graphBasePath base graph path providing the set of relations to patch
+	 * @param workingDir intermediate storage location
+	 * @param idMappingPath dataset providing the old -> new identifier mapping
+	 */
+	private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir,
+		final String idMappingPath) {
+
+		final String relationPath = graphBasePath + "/relation";
+
+		final Dataset<Relation> rels = Utils.readPath(spark, relationPath, Relation.class);
+		final Dataset<RelationIdMapping> idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class);
+
+		// both counts are evaluated only to be logged
+		log.info("relations: {}", rels.count());
+		log.info("idMapping: {}", idMapping.count());
+
+		// first pass: left join on source = oldId; the source is replaced only when a mapping row with a non-null
+		// newId matched, otherwise the relation is returned as it is
+		final Dataset<Relation> bySource = rels
+			.joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
+			.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
+				final Relation r = t._1();
+				Optional
+					.ofNullable(t._2())
+					.map(RelationIdMapping::getNewId)
+					.ifPresent(r::setSource);
+				return r;
+			}, Encoders.bean(Relation.class));
+
+		// second pass: same procedure applied to the target, then each relation is serialised as JSON and stored
+		// as gzip compressed text in workingDir, overwriting any previous content
+		bySource
+			.joinWith(idMapping, bySource.col("target").equalTo(idMapping.col("oldId")), "left")
+			.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
+				final Relation r = t._1();
+				Optional
+					.ofNullable(t._2())
+					.map(RelationIdMapping::getNewId)
+					.ifPresent(r::setTarget);
+				return r;
+			}, Encoders.bean(Relation.class))
+			.map(
+				(MapFunction<Relation, String>) OBJECT_MAPPER::writeValueAsString,
+				Encoders.STRING())
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.text(workingDir);
+
+		// copies the patched relations from workingDir back onto the original relation path, overwriting it.
+		// NOTE(review): presumably the intermediate copy is needed because relationPath is also the input of the
+		// plan above and cannot be overwritten while being read - confirm
+		spark
+			.read()
+			.textFile(workingDir)
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.text(relationPath);
+	}
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java
@ -0,0 +1,25 @@
+
+package eu.dnetlib.dhp.oa.graph.raw.common;
+
+/**
+ * Java bean describing a single identifier substitution: the identifier to be replaced (oldId) and the identifier
+ * replacing it (newId).
+ */
+public class RelationIdMapping {
+
+	// identifier to be replaced
+	private String oldId;
+
+	// identifier replacing oldId
+	private String newId;
+
+	/** @return the identifier to be replaced */
+	public String getOldId() {
+		return oldId;
+	}
+
+	/** @param oldId the identifier to be replaced */
+	public void setOldId(final String oldId) {
+		this.oldId = oldId;
+	}
+
+	/** @return the replacement identifier */
+	public String getNewId() {
+		return newId;
+	}
+
+	/** @param newId the replacement identifier */
+	public void setNewId(final String newId) {
+		this.newId = newId;
+	}
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json
@ -0,0 +1,26 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "g",
+    "paramLongName": "graphBasePath",
+    "paramDescription": "base graph path providing the set of relations to patch",
+    "paramRequired": true
+  },
+  {
+    "paramName": "w",
+    "paramLongName": "workingDir",
+    "paramDescription": "intermediate storage location",
+    "paramRequired": true
+  },
+  {
+    "paramName": "i",
+    "paramLongName": "idMappingPath",
+    "paramDescription": "dataset providing the old -> new identifier mapping",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
@ -100,6 +100,16 @@
            <value></value>
            <description>a blacklist of nsprefixes (comma separeted)</description>
        </property>
+        <property>
+            <name>shouldPatchRelations</name>
+            <value>false</value>
+            <description>activates the relation patching phase, driven by the content in ${idMappingPath}</description>
+        </property>
+        <property>
+            <name>idMappingPath</name>
+            <value></value>
+            <description>path pointing to the relations identifiers mapping dataset</description>
+        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -551,7 +561,6 @@
        <path start="merge_claims_relation"/>
    </fork>

-
    <action name="merge_claims_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@ -760,7 +769,42 @@
        <error to="Kill"/>
    </action>

-    <join name="wait_merge" to="End"/>
+    <join name="wait_merge" to="decisionPatchRelations"/>
+
+    <!-- routes to the patchRelations action only when shouldPatchRelations is "true" and the path built from
+         nameNode, '/' and idMappingPath exists; in any other case the workflow goes straight to End -->
+    <decision name="decisionPatchRelations">
+        <switch>
+            <case to="patchRelations">
+                ${(shouldPatchRelations eq "true") and
+                (fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")}
+            </case>
+            <default to="End"/>
+        </switch>
+    </decision>
+
+    <!-- runs PatchRelationsApplication on the graph stored in graphOutputPath, using workingDir/patch_relations as
+         intermediate storage and idMappingPath as the identifier mapping; success leads to End, failure to Kill -->
+    <action name="patchRelations">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PatchRelations</name>
+            <class>eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--graphBasePath</arg><arg>${graphOutputPath}</arg>
+            <arg>--workingDir</arg><arg>${workingDir}/patch_relations</arg>
+            <arg>--idMappingPath</arg><arg>${idMappingPath}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>

    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@ -1,11 +1,9 @@

 package eu.dnetlib.dhp.oa.graph.raw;

-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup;
+import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames;
+import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.Mockito.lenient;

 import java.io.IOException;
@ -25,15 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.Dataset;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.Instance;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.schema.oaf.Software;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.PidType;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@ -74,7 +64,7 @@ public class MappersTest {

 		assertValidId(p.getId());

-		assertEquals(1, p.getOriginalId().size());
+		assertEquals(2, p.getOriginalId().size());
 		assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718"));

 		assertValidId(p.getCollectedfrom().get(0).getKey());
@ -261,8 +251,8 @@ public class MappersTest {
 		final Relation r2 = (Relation) list.get(2);

 		assertValidId(d.getId());
-		assertEquals(1, d.getOriginalId().size());
-		assertTrue(d.getOriginalId().contains("oai:zenodo.org:3234526"));
+		assertEquals(2, d.getOriginalId().size());
+		assertTrue(d.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:zenodo.org:3234526")));
 		assertValidId(d.getCollectedfrom().get(0).getKey());
 		assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
 		assertTrue(d.getAuthor().size() > 0);
@ -351,8 +341,11 @@ public class MappersTest {
 		final Publication p = (Publication) list.get(0);

 		assertValidId(p.getId());
-		assertTrue(p.getOriginalId().size() == 1);
-		assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
+		assertEquals(2, p.getOriginalId().size());
+
+		assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739")));
+		// assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
+
 		assertValidId(p.getCollectedfrom().get(0).getKey());
 		assertTrue(p.getAuthor().size() > 0);

@ -413,7 +406,8 @@ public class MappersTest {
 		assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, d.getDataInfo().getProvenanceaction().getSchemename());

 		assertValidId(d.getId());
-		assertTrue(d.getOriginalId().size() == 1);
+		assertEquals(2, d.getOriginalId().size());
+
 		assertEquals("feabb67c-1fd1-423b-aec6-606d04ce53c6", d.getOriginalId().get(0));
 		assertValidId(d.getCollectedfrom().get(0).getKey());

@ -567,31 +561,6 @@ public class MappersTest {
 		assertNotNull(d.getInstance().get(0).getUrl());
 	}

-	@Test
-	void testEnermaps() throws IOException {
-		final String xml = IOUtils.toString(getClass().getResourceAsStream("enermaps.xml"));
-		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
-
-		System.out.println("***************");
-		System.out.println(new ObjectMapper().writeValueAsString(list));
-		System.out.println("***************");
-
-		assertEquals(1, list.size());
-		assertTrue(list.get(0) instanceof Dataset);
-
-		final Dataset d = (Dataset) list.get(0);
-
-		assertValidId(d.getId());
-		assertValidId(d.getCollectedfrom().get(0).getKey());
-		assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
-		assertEquals(1, d.getAuthor().size());
-		assertEquals(1, d.getInstance().size());
-		assertNotNull(d.getInstance().get(0).getUrl());
-		assertNotNull(d.getContext());
-		assertTrue(StringUtils.isNotBlank(d.getContext().get(0).getId()));
-		assertEquals("enermaps::selection::tgs00004", d.getContext().get(0).getId());
-	}
-
 	@Test
 	void testClaimFromCrossref() throws IOException {
 		final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
@ -673,6 +642,30 @@ public class MappersTest {
 		System.out.println(p.getTitle().get(0).getValue());
 	}

+	/**
+	 * Maps the record in oaf_jairo.xml (whose title is written in Japanese) and verifies that the resulting
+	 * publication has exactly one non-blank title, and that the title list is still non-empty after applying
+	 * fixVocabularyNames and cleanup.
+	 */
+	@Test
+	void testJairo() throws IOException {
+		final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml"));
+		final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
+
+		System.out.println("***************");
+		System.out.println(new ObjectMapper().writeValueAsString(list));
+		System.out.println("***************");
+
+		final Publication p = (Publication) list.get(0);
+		assertValidId(p.getId());
+		assertValidId(p.getCollectedfrom().get(0).getKey());
+
+		assertNotNull(p.getTitle());
+		assertFalse(p.getTitle().isEmpty());
+		assertTrue(p.getTitle().size() == 1);
+		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
+
+		// the title must not be dropped by the cleaning procedure
+		final Publication p_cleaned = cleanup(fixVocabularyNames(p));
+
+		assertNotNull(p_cleaned.getTitle());
+		assertFalse(p_cleaned.getTitle().isEmpty());
+	}
+
 	@Test
 	void testOdfFromHdfs() throws IOException {
 		final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
@ -688,8 +681,8 @@ public class MappersTest {
 		final Dataset p = (Dataset) list.get(0);

 		assertValidId(p.getId());
-		assertTrue(p.getOriginalId().size() == 1);
-		assertEquals("df76e73f-0483-49a4-a9bb-63f2f985574a", p.getOriginalId().get(0));
+		assertEquals(2, p.getOriginalId().size());
+		assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("df76e73f-0483-49a4-a9bb-63f2f985574a")));
 		assertValidId(p.getCollectedfrom().get(0).getKey());
 		assertTrue(p.getAuthor().size() > 0);

--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationApplicationTest.java
@ -0,0 +1,115 @@
+
+package eu.dnetlib.dhp.oa.graph.raw;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
+/**
+ * Runs PatchRelationsApplication on a local Spark session against the test resources id_mapping.json and
+ * relations_to_patch.json, and verifies the identifiers found in the rewritten relations.
+ */
+public class PatchRelationApplicationTest {
+
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+	// location of the identifier mapping file, relative to the temporary working directory
+	public static final String ID_MAPPING_PATH = "map/id_mapping.json";
+
+	private static SparkSession spark;
+
+	private static Path workingDir;
+
+	private static final Logger log = LoggerFactory.getLogger(PatchRelationApplicationTest.class);
+
+	/**
+	 * Creates the temporary working directory, starts the local Spark session and copies the test resources
+	 * (identifier mapping and relations to patch) under the working directory.
+	 */
+	@BeforeAll
+	public static void beforeAll() throws IOException {
+		workingDir = Files
+			.createTempDirectory(PatchRelationApplicationTest.class.getSimpleName());
+		log.info("using work dir {}", workingDir);
+
+		SparkConf conf = new SparkConf();
+		conf.setAppName(PatchRelationApplicationTest.class.getSimpleName());
+
+		conf.setMaster("local[*]");
+		conf.set("spark.driver.host", "localhost");
+		conf.set("hive.metastore.local", "true");
+		conf.set("spark.ui.enabled", "false");
+		conf.set("spark.sql.warehouse.dir", workingDir.toString());
+		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(PatchRelationApplicationTest.class.getSimpleName())
+			.config(conf)
+			.getOrCreate();
+
+		FileUtils
+			.copyInputStreamToFile(
+				PatchRelationApplicationTest.class.getResourceAsStream("id_mapping.json"),
+				workingDir.resolve(ID_MAPPING_PATH).toFile());
+
+		// stored under <graphBasePath>/relation, the location read by the application
+		FileUtils
+			.copyInputStreamToFile(
+				PatchRelationApplicationTest.class.getResourceAsStream("relations_to_patch.json"),
+				workingDir.resolve("graphBasePath/relation/rels.json").toFile());
+
+	}
+
+	/**
+	 * Removes the temporary working directory and stops the Spark session.
+	 */
+	@AfterAll
+	public static void afterAll() throws IOException {
+		FileUtils.deleteDirectory(workingDir.toFile());
+		spark.stop();
+	}
+
+	/**
+	 * Runs the application and checks that the relations are still 6, that the identifiers 1a and 2a do not appear
+	 * anymore, that 1b and 2b appear in their place, and that the other identifiers are untouched.
+	 */
+	@Test
+	public void testPatchRelationApplication() throws Exception {
+
+		final String graphBasePath = workingDir.toString() + "/graphBasePath";
+		// the session is created by this test class, hence isSparkSessionManaged is passed as false
+		PatchRelationsApplication.main(new String[] {
+			"-isSparkSessionManaged", Boolean.FALSE.toString(),
+			"-graphBasePath", graphBasePath,
+			"-workingDir", workingDir.toString() + "/workingDir",
+			"-idMappingPath", workingDir.toString() + "/" + ID_MAPPING_PATH
+		});
+
+		// reads back the relations from the patched relation path
+		final List<Relation> rels = spark
+			.read()
+			.textFile(graphBasePath + "/relation")
+			.map(
+				(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
+				Encoders.bean(Relation.class))
+			.collectAsList();
+
+		assertEquals(6, rels.size());
+
+		assertEquals(0, getCount(rels, "1a"), "should be patched to 1b");
+		assertEquals(0, getCount(rels, "2a"), "should be patched to 2b");
+
+		assertEquals(2, getCount(rels, "10a"), "not included in patching");
+		assertEquals(2, getCount(rels, "20a"), "not included in patching");
+
+		assertEquals(2, getCount(rels, "15a"), "not included in patching");
+		assertEquals(2, getCount(rels, "25a"), "not included in patching");
+
+		assertEquals(2, getCount(rels, "1b"), "patched from 1a");
+		assertEquals(2, getCount(rels, "2b"), "patched from 2a");
+	}
+
+	/**
+	 * Counts the relations having the given identifier either as source or as target.
+	 *
+	 * @param rels the relations to inspect
+	 * @param id the identifier to look for
+	 * @return the number of relations whose source or target equals id
+	 */
+	private long getCount(List<Relation> rels, final String id) {
+		return rels.stream().filter(r -> r.getSource().equals(id) || r.getTarget().equals(id)).count();
+	}
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/id_mapping.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/id_mapping.json
@ -0,0 +1,5 @@
+{"oldId": "1a", "newId": "1b"}
+{"oldId": "2a", "newId": "2b"}
+{"oldId": "3a", "newId": "3b"}
+{"oldId": "4a", "newId": "4b"}
+{"oldId": "5a", "newId": "5b"}
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml
@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<record xmlns:dc="http://purl.org/dc/elements/1.1/"
+        xmlns:dr="http://www.driver-repository.eu/namespace/dr"
+        xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+        xmlns:oaf="http://namespace.openaire.eu/oaf"
+        xmlns:oai="http://www.openarchives.org/OAI/2.0/"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <header xmlns="http://namespace.openaire.eu/">
+        <dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
+        <dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
+        <dri:dateOfCollection/>
+        <dri:mdFormat/>
+        <dri:mdFormatInterpretation/>
+        <dri:repositoryId/>
+        <dr:objectIdentifier/>
+        <dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
+        <dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
+        <oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
+    </header>
+    <metadata xmlns="http://namespace.openaire.eu/">
+        <dc:title>多項式GCDを用いた復号法に関する研究</dc:title>
+        <dc:creator>上原, 剛</dc:creator>
+        <dc:creator>甲斐, 博</dc:creator>
+        <dc:creator>野田, 松太郎</dc:creator>
+        <dc:format>application/pdf</dc:format>
+        <dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
+        <dc:language>jpn</dc:language>
+        <dc:publisher>京都大学数理解析研究所</dc:publisher>
+        <dc:subject classid="ndc" classname="ndc"
+                    schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
+        <dc:type>Departmental Bulletin Paper</dc:type>
+        <dr:CobjCategory type="publication">0014</dr:CobjCategory>
+        <oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
+        <oaf:projectid/>
+        <oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
+        <oaf:accessrights>OPEN</oaf:accessrights>
+        <oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
+        <oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
+        <oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
+        <oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
+        <oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
+        <oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
+        <oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
+    </metadata>
+    <about>
+        <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
+            <originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
+                <baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
+                <identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
+                <datestamp>2021-04-13T13:36:29Z</datestamp>
+                <metadataNamespace/>
+                <originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
+                    <baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
+                    <identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
+                    <datestamp>2012-07-12T14:15:41Z</datestamp>
+                    <metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
+                </originDescription>
+            </originDescription>
+        </provenance>
+        <oaf:datainfo>
+            <oaf:inferred>false</oaf:inferred>
+            <oaf:deletedbyinference>false</oaf:deletedbyinference>
+            <oaf:trust>0.9</oaf:trust>
+            <oaf:inferenceprovenance/>
+            <oaf:provenanceaction classid="sysimport:crosswalk:repository"
+                                  classname="sysimport:crosswalk:repository"
+                                  schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
+        </oaf:datainfo>
+    </about>
+</record>
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/relations_to_patch.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/relations_to_patch.json
@ -0,0 +1,6 @@
+{"source":"1a","target":"10a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
+{"source":"10a","target":"1a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
+{"source":"2a","target":"20a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
+{"source":"20a","target":"2a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
+{"source":"15a","target":"25a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
+{"source":"25a","target":"15a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/oaf_to_summary
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/oaf_to_summary
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
@ -10,6 +10,7 @@ import java.util.Set;
 import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@ -81,6 +82,7 @@ public class PrepareRelationsJob {

 		Set<String> relationFilter = Optional
 			.ofNullable(parser.get("relationFilter"))
+			.map(String::toLowerCase)
 			.map(s -> Sets.newHashSet(Splitter.on(",").split(s)))
 			.orElse(new HashSet<>());
 		log.info("relationFilter: {}", relationFilter);
@ -130,7 +132,7 @@ public class PrepareRelationsJob {

 		JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
 			.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
-			.filter(rel -> relationFilter.contains(rel.getRelClass()) == false);
+			.filter(rel -> relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())) == false);

 		JavaRDD<Relation> pruned = pruneRels(
 			pruneRels(
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
@ -16,7 +16,6 @@ import javax.xml.transform.*;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;

-import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.util.LongAccumulator;
 import org.dom4j.Document;
@ -43,6 +42,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;

 public class XmlRecordFactory implements Serializable {

--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java
@ -7,8 +7,6 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.util.List;

-import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
-import eu.dnetlib.dhp.schema.oaf.Dataset;
 import org.apache.commons.io.IOUtils;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
@ -25,6 +23,7 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
 import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
 import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
 import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
 import eu.dnetlib.dhp.schema.oaf.Project;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Relation;
@ -137,17 +136,18 @@ public class XmlRecordFactoryTest {
 	@Test
 	public void testEnermapsRecord() throws IOException, DocumentException {

-		String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>" +
-				"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>"+
-				"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>"+
-				"</entries>";
+		String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>"
+			+
+			"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>" +
+			"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>" +
+			"</entries>";

 		ContextMapper contextMapper = ContextMapper.fromXml(contextmap);
 		XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation,
-				otherDsTypeId);
+			otherDsTypeId);

 		Dataset d = OBJECT_MAPPER
-				.readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class);
+			.readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class);

 		JoinedEntity je = new JoinedEntity<>(d);

--- a/pom.xml
+++ b/pom.xml
@ -205,6 +205,11 @@
 				<artifactId>dateparser</artifactId>
 				<version>1.0.7</version>
 			</dependency>
+			<dependency>
+				<groupId>me.xuender</groupId>
+				<artifactId>unidecode</artifactId>
+				<version>0.0.7</version>
+			</dependency>

 			<dependency>
 				<groupId>com.google.guava</groupId>