From 270df939c451ff9a5b166b79436c22d9b3609afb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 25 Jul 2023 17:29:50 +0200 Subject: [PATCH 1/2] partial implementation of the suggestions from https://support.openaire.eu/issues/8898 --- .../oaf/utils/GraphCleaningFunctions.java | 57 ++++++++++++++++--- .../clean/GraphCleaningFunctionsTest.java | 31 ++++++++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 2 +- .../dhp/oa/graph/clean/result_dataset.json | 28 +++++++++ 4 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index a47b63edb..dff794c0d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -13,11 +13,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -39,6 +35,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST); public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; + private static final String NAME_CLEANING_REGEX = "[\\r\\n\\t\\s]+"; public static T cleanContext(T value, String contextId, String verifyParam) { if (ModelSupport.isSubClass(value, Result.class)) { @@ -247,7 +244,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { if (value instanceof Datasource) { // nothing to evaluate here } else if (value instanceof Project) { - // nothing to evaluate here + final Project p = (Project) value; + return Objects.isNull(p.getCode()) || StringUtils.isBlank(p.getCode().getValue()); } else if (value instanceof Organization) { // nothing to evaluate here } else if (value instanceof Relation) { @@ -294,6 +292,12 @@ public class GraphCleaningFunctions extends CleaningFunctions { } else if (value instanceof Result) { Result r = (Result) value; + if (Objects.nonNull(r.getFulltext()) && (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || + ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { + r.setFulltext(null); + + } + if (Objects.nonNull(r.getDateofacceptance())) { Optional date = cleanDateField(r.getDateofacceptance()); if (date.isPresent()) { @@ -318,8 +322,15 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .collect(Collectors.toList())); } - if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { - r.setPublisher(null); + if (Objects.nonNull(r.getPublisher())) { + if (StringUtils.isBlank(r.getPublisher().getValue())) { + r.setPublisher(null); + } else { + r.getPublisher().setValue( + r.getPublisher().getValue() + .replaceAll(NAME_CLEANING_REGEX, " ") + ); + } } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { r @@ -486,6 +497,11 @@ public class GraphCleaningFunctions extends CleaningFunctions { i.setDateofacceptance(null); } } + if (StringUtils.isNotBlank(i.getFulltext()) && + (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || + ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { + i.setFulltext(null); + } } } if (Objects.isNull(r.getBestaccessright()) @@ -510,6 +526,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(Objects::nonNull) .filter(a -> StringUtils.isNotBlank(a.getFullname())) .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) + .map(GraphCleaningFunctions::cleanupAuthor) .collect(Collectors.toList())); boolean nullRank = r @@ -604,6 +621,32 @@ public class GraphCleaningFunctions extends CleaningFunctions { return value; } + private static Author cleanupAuthor(Author author) { + if (StringUtils.isNotBlank(author.getFullname())) { + author.setFullname( + author.getFullname() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"") + ); + } + if (StringUtils.isNotBlank(author.getName())) { + author.setName( + author.getName() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"") + ); + } + if (StringUtils.isNotBlank(author.getSurname())) { + author.setSurname( + author.getSurname() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"") + ); + } + + return author; + } + private static Optional cleanDateField(Field dateofacceptance) { return Optional .ofNullable(dateofacceptance) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 24b942f4d..be8307b50 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -251,6 +251,12 @@ public class GraphCleaningFunctionsTest { pid.getQualifier().getClassname())); }); + assertTrue( + p_cleaned + .getAuthor() + .stream() + .anyMatch(a -> "Brien, Tom".equals(a.getFullname()))); + assertNotNull(p_cleaned.getSubject()); List fos_subjects = p_cleaned @@ -285,6 +291,31 @@ public class GraphCleaningFunctionsTest { System.out.println(MAPPER.writeValueAsString(p_cleaned)); } + @Test + void testCleaning_dataset() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json")); + Dataset p_in = MAPPER.readValue(json, Dataset.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Dataset); + + Dataset p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); + + assertNotNull(p_out); + + assertNotNull(p_out.getPublisher()); + assertNotNull(p_out.getPublisher().getValue()); + + Dataset p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); + + assertEquals("Best publisher in the world", p_cleaned.getPublisher().getValue()); + } + private static void verify_keyword(Publication p_cleaned, String subject) { Optional s1 = p_cleaned .getSubject() diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 8f35470e1..8ef642dd3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -3,7 +3,7 @@ { "affiliation": [ ], - "fullname": "Brien, Tom", + "fullname": "Brien, Tom", "name": "Tom", "pid": [ { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json new file mode 100644 index 000000000..bec87c7ae --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json @@ -0,0 +1,28 @@ +{ + "resulttype": { + "classid": "dataset", + "classname": "dataset", + "schemeid": "dnet:result_typologies", + "schemename": "dnet:result_typologies" + }, + "fulltext": [ + { + "value" : "https://www.researchgate.net" + } + ], + "publisher" : { + "value" : "Best publisher in the world" + }, + "id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375", + "instance": [ + { + "instancetype": { + "classid": "Comment/debate", + "classname": "Comment/debate", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "fulltext": "https://www.researchgate.net" + } + ] +} \ No newline at end of file From d8435a6512f201189b415391b7de3d8ca3ee7b90 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 25 Jul 2023 17:39:57 +0200 Subject: [PATCH 2/2] inverted condition --- .../eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index dff794c0d..705967fcf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -245,7 +245,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { // nothing to evaluate here } else if (value instanceof Project) { final Project p = (Project) value; - return Objects.isNull(p.getCode()) || StringUtils.isBlank(p.getCode().getValue()); + return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue()); } else if (value instanceof Organization) { // nothing to evaluate here } else if (value instanceof Relation) {