diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 74f31cf35..4c7810c47 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -25,6 +25,11 @@
com.github.sisyphsu
dateparser
+
+ me.xuender
+ unidecode
+
+
org.apache.spark
spark-core_2.11
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
index e5181b111..1d002ed7e 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.function.Function;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
-import org.jetbrains.annotations.NotNull;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
+import me.xuender.unidecode.Unidecode;
public class GraphCleaningFunctions extends CleaningFunctions {
@@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(
- sp -> sp
- .getValue()
- .toLowerCase()
- .replaceAll(TITLE_FILTER_REGEX, "")
- .length() > TITLE_FILTER_RESIDUAL_LENGTH)
+ sp -> {
+ final String title = sp
+ .getValue()
+ .toLowerCase();
+ final String residual = Unidecode
+ .decode(title)
+ .replaceAll(TITLE_FILTER_REGEX, "");
+ return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
+ })
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
index eefa1e9a3..8d519a93f 100644
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
@@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
-import java.time.LocalDate;
-import java.time.format.DateTimeFormatter;
import java.util.HashSet;
import java.util.List;
-import java.util.Locale;
-import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
@@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import me.xuender.unidecode.Unidecode;
public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+ /**
+  * Pins the ASCII transliterations returned by {@code Unidecode.decode} for a sample of
+  * non-Latin inputs (Chinese and Japanese ideographs and kana, Armenian, Cyrillic, Greek,
+  * Arabic) and checks that a plain ASCII string is returned unchanged.
+  */
+ @Test
+ public void testUnidecode() {
+
+ assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ"));
+ assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛"));
+ assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼"));
+ assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい"));
+ assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի"));
+ assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики"));
+ assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ"));
+ assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης"));
+ assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية"));
+ assertEquals("abc def ghi", Unidecode.decode("abc def ghi"));
+ }
+
@Test
public void testDateValidation() {
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
index 63f18a803..ba4211a3f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@@ -1,6 +1,8 @@
package eu.dnetlib.dhp.oa.graph.raw;
+import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup;
+import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
@@ -640,6 +642,30 @@ public class MappersTest {
System.out.println(p.getTitle().get(0).getValue());
}
+ /**
+  * Maps the {@code oaf_jairo.xml} fixture (a record whose only title is written in Japanese)
+  * through {@code OafToOafMapper} and checks that the first mapped record is a
+  * {@code Publication} with valid identifiers and exactly one non-blank title. It then runs
+  * {@code cleanup(fixVocabularyNames(p))} and checks that the title list is still non-empty.
+  * NOTE(review): presumably a regression test for the title filter dropping non-Latin
+  * titles; confirm against the cleaning rules.
+  *
+  * @throws IOException if the fixture cannot be read or the mapped records cannot be serialized
+  */
+ @Test
+ void testJairo() throws IOException {
+ final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml"));
+ final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
+
+ System.out.println("***************");
+ System.out.println(new ObjectMapper().writeValueAsString(list));
+ System.out.println("***************");
+
+ final Publication p = (Publication) list.get(0);
+ assertValidId(p.getId());
+ assertValidId(p.getCollectedfrom().get(0).getKey());
+
+ assertNotNull(p.getTitle());
+ assertFalse(p.getTitle().isEmpty());
+ // assertEquals reports the expected and actual size on failure, unlike assertTrue(size == 1)
+ assertEquals(1, p.getTitle().size());
+ assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
+
+ final Publication p_cleaned = cleanup(fixVocabularyNames(p));
+
+ assertNotNull(p_cleaned.getTitle());
+ assertFalse(p_cleaned.getTitle().isEmpty());
+ }
+
@Test
void testOdfFromHdfs() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml
new file mode 100644
index 000000000..9ec696256
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml
@@ -0,0 +1,70 @@
+
+
+
+ jairo_______::000012e58ed836576ef2a0d38b0f726f
+ oai:irdb.nii.ac.jp:01221:0000010198
+
+
+
+
+
+ 2021-05-10T11:31:09.424Z
+ 2021-06-03T01:45:42.536Z
+ jairo_______
+
+
+ 多項式GCDを用いた復号法に関する研究
+ 上原, 剛
+ 甲斐, 博
+ 野田, 松太郎
+ application/pdf
+ http://hdl.handle.net/2433/25934
+ jpn
+ 京都大学数理解析研究所
+ 410
+ Departmental Bulletin Paper
+ 0014
+ 2004-10-01
+
+ openaire____::554c7c2873
+ OPEN
+
+
+ 2433/25934
+ AN00061013
+ http://hdl.handle.net/2433/25934
+ http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf
+ 数理解析研究所講究録
+
+
+
+
+ https%3A%2F%2Firdb.nii.ac.jp%2Foai
+ oai:irdb.nii.ac.jp:01221:0000010198
+ 2021-04-13T13:36:29Z
+
+
+ http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request
+ oai:repository.kulib.kyoto-u.ac.jp:2433/25934
+ 2012-07-12T14:15:41Z
+ http://irdb.nii.ac.jp/oai
+
+
+
+
+ false
+ false
+ 0.9
+
+
+
+
+
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 6e4526e41..fc4a8a21b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -205,6 +205,11 @@
dateparser
1.0.7
+
+ me.xuender
+ unidecode
+ 0.0.7
+
com.google.guava