From 82a4e4efaedee1e3796f0edf783a40c97339734d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Nov 2021 14:17:22 +0100 Subject: [PATCH] [cleaning wf] fixed methodology to rule out invalid result titles, based on https://support.openaire.eu/issues/7206 --- .../oaf/utils/GraphCleaningFunctions.java | 19 ++++-- .../clean/GraphCleaningFunctionsTest.java | 15 +++- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 68 ++++++++++++++++++- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 43413b311e..592580ab8b 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -27,7 +27,10 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; - public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d"; + + public static final String TITLE_TEST = "test"; + public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST); + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; public static T fixVocabularyNames(T value) { @@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions { final String title = sp .getValue() .toLowerCase(); - final String residual = Unidecode - .decode(title) - .replaceAll(TITLE_FILTER_REGEX, ""); - return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; + final String decoded = Unidecode.decode(title); + + if (StringUtils.contains(decoded, TITLE_TEST)) { + return decoded + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH; + } + return !decoded + .replaceAll("\\W|\\d", "") + .isEmpty(); }) .map(GraphCleaningFunctions::cleanValue) .collect(Collectors.toList())); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 42d9f226cd..aa9535ef7f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.io.IOUtils; @@ -137,9 +138,21 @@ public class GraphCleaningFunctionsTest { .stream() .anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); + assertEquals(5, p_out.getTitle().size()); + Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); - assertEquals(1, p_cleaned.getTitle().size()); + assertEquals(3, p_cleaned.getTitle().size()); + + List titles = p_cleaned + .getTitle() + .stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toList()); + assertTrue(titles.contains("omic")); + assertTrue( + titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test")); + assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳")); assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid()); assertNull(p_out.getPublisher()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 6795ccf1ba..b3e3024746 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -864,7 +864,7 @@ "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title" }, - "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers" + "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test" }, { "dataInfo": { @@ -887,6 +887,72 @@ "schemename": "dnet:dataCite_title" }, "value": "test test 123 test" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "omic" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "-" } ] } \ No newline at end of file