From 6dddad86ee10543a2d96e3fe7a555bd287492e0c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 28 Jul 2021 16:21:29 +0200 Subject: [PATCH] [cleaning] title cleaning based on the me.xuender:unidecode library --- dhp-common/pom.xml | 5 ++ .../oaf/utils/GraphCleaningFunctions.java | 19 ++--- .../schema/oaf/utils/OafMapperUtilsTest.java | 25 +++++-- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 26 +++++++ .../eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml | 70 +++++++++++++++++++ pom.xml | 5 ++ 6 files changed, 136 insertions(+), 14 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 74f31cf35..4c7810c47 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -25,6 +25,11 @@ com.github.sisyphsu dateparser + + me.xuender + unidecode + + org.apache.spark spark-core_2.11 diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index e5181b111..1d002ed7e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.*; import java.util.function.Function; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import me.xuender.unidecode.Unidecode; public class GraphCleaningFunctions extends CleaningFunctions { @@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter( - sp -> sp - .getValue() - .toLowerCase() - .replaceAll(TITLE_FILTER_REGEX, "") - .length() > TITLE_FILTER_RESIDUAL_LENGTH) + sp -> { + final String title = sp + .getValue() + .toLowerCase(); + final String residual = Unidecode + .decode(title) + .replaceAll(TITLE_FILTER_REGEX, ""); + return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; + }) .map(GraphCleaningFunctions::cleanValue) .collect(Collectors.toList())); } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index eefa1e9a3..8d519a93f 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; import java.util.HashSet; import java.util.List; -import java.util.Locale; -import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; +import me.xuender.unidecode.Unidecode; public class OafMapperUtilsTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @Test + public void testUnidecode() { + + assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ")); + assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛")); + assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼")); + assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい")); + assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի")); + assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики")); + assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ")); + assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης")); + assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية")); + assertEquals("abc def ghi", Unidecode.decode("abc def ghi")); + } + @Test public void testDateValidation() { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 63f18a803..ba4211a3f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup; +import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.lenient; @@ -640,6 +642,30 @@ public class MappersTest { System.out.println(p.getTitle().get(0).getValue()); } + @Test + void testJairo() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml")); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + + assertNotNull(p.getTitle()); + assertFalse(p.getTitle().isEmpty()); + assertTrue(p.getTitle().size() == 1); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + + final Publication p_cleaned = cleanup(fixVocabularyNames(p)); + + assertNotNull(p_cleaned.getTitle()); + assertFalse(p_cleaned.getTitle().isEmpty()); + } + @Test void testOdfFromHdfs() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml")); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml new file mode 100644 index 000000000..9ec696256 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml @@ -0,0 +1,70 @@ + + +
+ jairo_______::000012e58ed836576ef2a0d38b0f726f + oai:irdb.nii.ac.jp:01221:0000010198 + + + + + + 2021-05-10T11:31:09.424Z + 2021-06-03T01:45:42.536Z + jairo_______ +
+ + 多項式GCDを用いた復号法に関する研究 + 上原, 剛 + 甲斐, 博 + 野田, 松太郎 + application/pdf + http://hdl.handle.net/2433/25934 + jpn + 京都大学数理解析研究所 + 410 + Departmental Bulletin Paper + 0014 + 2004-10-01 + + openaire____::554c7c2873 + OPEN + + + 2433/25934 + AN00061013 + http://hdl.handle.net/2433/25934 + http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf + 数理解析研究所講究録 + + + + + https%3A%2F%2Firdb.nii.ac.jp%2Foai + oai:irdb.nii.ac.jp:01221:0000010198 + 2021-04-13T13:36:29Z + + + http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request + oai:repository.kulib.kyoto-u.ac.jp:2433/25934 + 2012-07-12T14:15:41Z + http://irdb.nii.ac.jp/oai + + + + + false + false + 0.9 + + + + +
\ No newline at end of file diff --git a/pom.xml b/pom.xml index 6e4526e41..fc4a8a21b 100644 --- a/pom.xml +++ b/pom.xml @@ -205,6 +205,11 @@ dateparser 1.0.7
+ + me.xuender + unidecode + 0.0.7 + com.google.guava