From 59764145bb88b49bc3e11a2749572b529ddc5954 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 25 Jul 2023 17:39:00 +0200 Subject: [PATCH 1/6] cherry picked & fixed commit 270df939c451ff9a5b166b79436c22d9b3609afb --- .../oaf/utils/GraphCleaningFunctions.java | 57 ++++++++++++++++--- .../clean/GraphCleaningFunctionsTest.java | 31 ++++++++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 2 +- .../dhp/oa/graph/clean/result_dataset.json | 28 +++++++++ 4 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index a47b63edb..705967fcf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -13,11 +13,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -39,6 +35,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST); public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; + private static final String NAME_CLEANING_REGEX = "[\\r\\n\\t\\s]+"; public static T cleanContext(T value, String contextId, String verifyParam) { if (ModelSupport.isSubClass(value, Result.class)) { @@ -247,7 +244,8 @@ public class GraphCleaningFunctions 
extends CleaningFunctions { if (value instanceof Datasource) { // nothing to evaluate here } else if (value instanceof Project) { - // nothing to evaluate here + final Project p = (Project) value; + return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue()); } else if (value instanceof Organization) { // nothing to evaluate here } else if (value instanceof Relation) { @@ -294,6 +292,12 @@ public class GraphCleaningFunctions extends CleaningFunctions { } else if (value instanceof Result) { Result r = (Result) value; + if (Objects.nonNull(r.getFulltext()) && (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || + ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { + r.setFulltext(null); + + } + if (Objects.nonNull(r.getDateofacceptance())) { Optional date = cleanDateField(r.getDateofacceptance()); if (date.isPresent()) { @@ -318,8 +322,15 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .collect(Collectors.toList())); } - if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { - r.setPublisher(null); + if (Objects.nonNull(r.getPublisher())) { + if (StringUtils.isBlank(r.getPublisher().getValue())) { + r.setPublisher(null); + } else { + r.getPublisher().setValue( + r.getPublisher().getValue() + .replaceAll(NAME_CLEANING_REGEX, " ") + ); + } } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { r @@ -486,6 +497,11 @@ public class GraphCleaningFunctions extends CleaningFunctions { i.setDateofacceptance(null); } } + if (StringUtils.isNotBlank(i.getFulltext()) && + (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || + ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { + i.setFulltext(null); + } } } if (Objects.isNull(r.getBestaccessright()) @@ -510,6 +526,7 @@ public 
class GraphCleaningFunctions extends CleaningFunctions { .filter(Objects::nonNull) .filter(a -> StringUtils.isNotBlank(a.getFullname())) .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) + .map(GraphCleaningFunctions::cleanupAuthor) .collect(Collectors.toList())); boolean nullRank = r @@ -604,6 +621,32 @@ public class GraphCleaningFunctions extends CleaningFunctions { return value; } + private static Author cleanupAuthor(Author author) { + if (StringUtils.isNotBlank(author.getFullname())) { + author.setFullname( + author.getFullname() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"") + ); + } + if (StringUtils.isNotBlank(author.getName())) { + author.setName( + author.getName() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"") + ); + } + if (StringUtils.isNotBlank(author.getSurname())) { + author.setSurname( + author.getSurname() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"") + ); + } + + return author; + } + private static Optional cleanDateField(Field dateofacceptance) { return Optional .ofNullable(dateofacceptance) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 24b942f4d..be8307b50 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -251,6 +251,12 @@ public class GraphCleaningFunctionsTest { pid.getQualifier().getClassname())); }); + assertTrue( + p_cleaned + .getAuthor() + .stream() + .anyMatch(a -> "Brien, Tom".equals(a.getFullname()))); + assertNotNull(p_cleaned.getSubject()); List fos_subjects = p_cleaned @@ -285,6 +291,31 @@ public class GraphCleaningFunctionsTest { 
System.out.println(MAPPER.writeValueAsString(p_cleaned)); } + @Test + void testCleaning_dataset() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json")); + Dataset p_in = MAPPER.readValue(json, Dataset.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Dataset); + + Dataset p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); + + assertNotNull(p_out); + + assertNotNull(p_out.getPublisher()); + assertNotNull(p_out.getPublisher().getValue()); + + Dataset p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); + + assertEquals("Best publisher in the world", p_cleaned.getPublisher().getValue()); + } + private static void verify_keyword(Publication p_cleaned, String subject) { Optional s1 = p_cleaned .getSubject() diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 8f35470e1..8ef642dd3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -3,7 +3,7 @@ { "affiliation": [ ], - "fullname": "Brien, Tom", + "fullname": "Brien, Tom", "name": "Tom", "pid": [ { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json new file mode 100644 index 000000000..bec87c7ae --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json @@ -0,0 +1,28 @@ +{ + "resulttype": { + "classid": "dataset", + "classname": "dataset", + "schemeid": "dnet:result_typologies", + 
"schemename": "dnet:result_typologies" + }, + "fulltext": [ + { + "value" : "https://www.researchgate.net" + } + ], + "publisher" : { + "value" : "Best publisher in the world" + }, + "id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375", + "instance": [ + { + "instancetype": { + "classid": "Comment/debate", + "classname": "Comment/debate", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "fulltext": "https://www.researchgate.net" + } + ] +} \ No newline at end of file From d512df8612ca87765db612974a53d74c351dab80 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 26 Jul 2023 09:14:08 +0200 Subject: [PATCH 2/6] code formatting --- .../oaf/utils/GraphCleaningFunctions.java | 55 +++++++++++-------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 705967fcf..87a43c07e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -292,9 +292,10 @@ public class GraphCleaningFunctions extends CleaningFunctions { } else if (value instanceof Result) { Result r = (Result) value; - if (Objects.nonNull(r.getFulltext()) && (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || - ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { - r.setFulltext(null); + if (Objects.nonNull(r.getFulltext()) + && (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || + ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { + r.setFulltext(null); } @@ -326,10 +327,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { if (StringUtils.isBlank(r.getPublisher().getValue())) { 
r.setPublisher(null); } else { - r.getPublisher().setValue( - r.getPublisher().getValue() - .replaceAll(NAME_CLEANING_REGEX, " ") - ); + r + .getPublisher() + .setValue( + r + .getPublisher() + .getValue() + .replaceAll(NAME_CLEANING_REGEX, " ")); } } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { @@ -498,8 +502,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { } } if (StringUtils.isNotBlank(i.getFulltext()) && - (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || - ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { + (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || + ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { i.setFulltext(null); } } @@ -623,25 +627,28 @@ public class GraphCleaningFunctions extends CleaningFunctions { private static Author cleanupAuthor(Author author) { if (StringUtils.isNotBlank(author.getFullname())) { - author.setFullname( - author.getFullname() - .replaceAll(NAME_CLEANING_REGEX, " ") - .replace("\"", "\\\"") - ); + author + .setFullname( + author + .getFullname() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"")); } if (StringUtils.isNotBlank(author.getName())) { - author.setName( - author.getName() - .replaceAll(NAME_CLEANING_REGEX, " ") - .replace("\"", "\\\"") - ); + author + .setName( + author + .getName() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"")); } if (StringUtils.isNotBlank(author.getSurname())) { - author.setSurname( - author.getSurname() - .replaceAll(NAME_CLEANING_REGEX, " ") - .replace("\"", "\\\"") - ); + author + .setSurname( + author + .getSurname() + .replaceAll(NAME_CLEANING_REGEX, " ") + .replace("\"", "\\\"")); } return author; From ccac6a7f7509f1dae699db886ca4cd5bc19a1593 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 31 Jul 2023 12:35:05 +0200 Subject: [PATCH 3/6] rule 
out records with NULL dataInfo --- .../dhp/schema/oaf/utils/GraphCleaningFunctions.java | 2 +- .../dhp/oa/graph/clean/GraphCleaningFunctionsTest.java | 9 +++++++++ .../resources/eu/dnetlib/dhp/oa/graph/clean/project.json | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/project.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 87a43c07e..831035d94 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -236,7 +236,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { d -> Optional .ofNullable(d.getInvisible()) .orElse(true)) - .orElse(true)) + .orElse(false)) .orElse(true))) { return true; } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index be8307b50..1492c60fe 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -368,6 +368,15 @@ public class GraphCleaningFunctionsTest { Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); } + @Test + public void testFilterProject() throws IOException { + String json = IOUtils + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/project.json")); + Project p_in = MAPPER.readValue(json, Project.class); + + Assertions.assertEquals(false, GraphCleaningFunctions.filter(p_in)); + } + @Test public void testCleanDoiBoost2() throws IOException { 
String json = IOUtils diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/project.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/project.json new file mode 100644 index 000000000..9ab714376 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/project.json @@ -0,0 +1 @@ +{"measures": [{"id": "downloads", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:usage_counts", "classname": "measure:usage_counts", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "count", "value": "1"}]}, {"id": "views", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:usage_counts", "classname": "measure:usage_counts", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "count", "value": "0"}]}], "id": "40|aka_________::591da07706352f1195afaeed4065f52e"} \ No newline at end of file From da1727f93fcca72ebb02f687f238aa09d95d1a68 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 31 Jul 2023 17:52:56 +0200 Subject: [PATCH 4/6] rule out records with NULL dataInfo, except for Relations --- .../oaf/utils/GraphCleaningFunctions.java | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 831035d94..1bfb54278 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -225,20 +225,20 @@ public class 
GraphCleaningFunctions extends CleaningFunctions { } public static boolean filter(T value) { - if (Boolean.TRUE - .equals( - Optional - .ofNullable(value) - .map( - o -> Optional - .ofNullable(o.getDataInfo()) - .map( - d -> Optional - .ofNullable(d.getInvisible()) - .orElse(true)) - .orElse(false)) - .orElse(true))) { - return true; + if (!(value instanceof Relation) && (Boolean.TRUE + .equals( + Optional + .ofNullable(value) + .map( + o -> Optional + .ofNullable(o.getDataInfo()) + .map( + d -> Optional + .ofNullable(d.getInvisible()) + .orElse(true)) + .orElse(false)) + .orElse(true)))) { + return true; } if (value instanceof Datasource) { From 7180911ded7aaa93bf6335d2ff3da444c4c6ea70 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 23 Jun 2023 16:10:49 +0200 Subject: [PATCH 5/6] [graph cleaning] fixed regex behaviour for cleaning ROR and GRID identifiers, added tests --- .../schema/oaf/utils/FundRefCleaningRule.java | 10 ++++++---- .../schema/oaf/utils/GridCleaningRule.java | 10 ++++++++-- .../schema/oaf/utils/ISNICleaningRule.java | 6 ++++-- .../dhp/schema/oaf/utils/PICCleaningRule.java | 6 ++++-- .../dhp/schema/oaf/utils/PmcCleaningRule.java | 13 ++++++++++++- .../schema/oaf/utils/PmidCleaningRule.java | 17 +++++++++++++---- .../dhp/schema/oaf/utils/RorCleaningRule.java | 13 +++++++++++-- .../oaf/utils/GridCleaningRuleTest.java | 18 ++++++++++++++++++ .../oaf/utils/ISNICleaningRuleTest.java | 19 +++++++++++++++++++ .../schema/oaf/utils/PICCleaningRuleTest.java | 19 +++++++++++++++++++ .../schema/oaf/utils/PmcCleaningRuleTest.java | 19 +++++++++++++++++++ .../oaf/utils/PmidCleaningRuleTest.java | 18 ++++++++++++++++++ .../schema/oaf/utils/RorCleaningRuleTest.java | 17 +++++++++++++++++ 13 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java create mode 
100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java index 7f6303825..a267b8b88 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java @@ -6,14 +6,16 @@ import java.util.regex.Pattern; public class FundRefCleaningRule { - public static String clean(final String fundrefId) { + public static final Pattern PATTERN = Pattern.compile("\\d+"); - String s = fundrefId + public static String clean(final String fundRefId) { + + String s = fundRefId .toLowerCase() .replaceAll("\\s", ""); - Matcher m = Pattern.compile("\\d+").matcher(s); - if (m.matches()) { + Matcher m = PATTERN.matcher(s); + if (m.find()) { return m.group(); } else { return ""; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java index ff45d6a0d..37ab91dd5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java @@ -6,13 +6,19 @@ import java.util.regex.Pattern; public class GridCleaningRule { + public static final Pattern PATTERN = Pattern.compile("(?<grid>\\d{4,6}\\.[0-9a-z]{1,2})"); + public static String clean(String grid) { String s = grid .replaceAll("\\s", "") .toLowerCase(); - Matcher m = 
Pattern.compile("\\d{4,6}\\.[0-9a-z]{1,2}").matcher(s); - return m.matches() ? "grid." + m.group() : ""; + Matcher m = PATTERN.matcher(s); + if (m.find()) { + return "grid." + m.group("grid"); + } + + return ""; } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java index 5bc49c453..bcd8279cc 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java @@ -7,10 +7,12 @@ import java.util.regex.Pattern; // https://www.wikidata.org/wiki/Property:P213 public class ISNICleaningRule { + public static final Pattern PATTERN = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])"); + public static String clean(final String isni) { - Matcher m = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])").matcher(isni); - if (m.matches()) { + Matcher m = PATTERN.matcher(isni); + if (m.find()) { return String.join("", m.group(1), m.group(2), m.group(3), m.group(4)); } else { return ""; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java index 83b9a1f9f..a2213ed9f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java @@ -6,10 +6,12 @@ import java.util.regex.Pattern; public class PICCleaningRule { + public static final Pattern PATTERN = Pattern.compile("\\d{9}"); + public static String clean(final String pic) { - Matcher m = Pattern.compile("\\d{9}").matcher(pic); - if (m.matches()) { + Matcher m = PATTERN.matcher(pic); + if (m.find()) { return m.group(); } else { return ""; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java 
b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java index 4e1205805..903041d43 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java @@ -1,13 +1,24 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + public class PmcCleaningRule { + public static final Pattern PATTERN = Pattern.compile("PMC\\d{1,8}"); + public static String clean(String pmc) { String s = pmc .replaceAll("\\s", "") .toUpperCase(); - return s.matches("^PMC\\d{1,8}$") ? s : ""; + + final Matcher m = PATTERN.matcher(s); + + if (m.find()) { + return m.group(); + } + return ""; } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java index 65833a594..d0f5a3b27 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java @@ -1,16 +1,25 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + // https://researchguides.stevens.edu/c.php?g=442331&p=6577176 public class PmidCleaningRule { + public static final Pattern PATTERN = Pattern.compile("[1-9]{1,8}"); + public static String clean(String pmid) { String s = pmid .toLowerCase() - .replaceAll("\\s", "") - .trim() - .replaceAll("^0+", ""); - return s.matches("^\\d{1,8}$") ? 
s : ""; + .replaceAll("\\s", ""); + + final Matcher m = PATTERN.matcher(s); + + if (m.find()) { + return m.group(); + } + return ""; } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java index f40cdb00c..f6685f19d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java @@ -7,12 +7,21 @@ import java.util.regex.Pattern; // https://ror.readme.io/docs/ror-identifier-pattern public class RorCleaningRule { + public static final String ROR_PREFIX = "https://ror.org/"; + + private static final Pattern PATTERN = Pattern.compile("(?<ror>0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2})"); + public static String clean(String ror) { String s = ror .replaceAll("\\s", "") .toLowerCase(); - Matcher m = Pattern.compile("0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}").matcher(s); - return m.matches() ? 
"https://ror.org/" + m.group() : ""; + + Matcher m = PATTERN.matcher(s); + + if (m.find()) { + return ROR_PREFIX + m.group("ror"); + } + return ""; } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java new file mode 100644 index 000000000..1b9163d46 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java @@ -0,0 +1,18 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class GridCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("grid.493784.5", GridCleaningRule.clean("grid.493784.5")); + assertEquals("grid.493784.5x", GridCleaningRule.clean("grid.493784.5x")); + assertEquals("grid.493784.5x", GridCleaningRule.clean("493784.5x")); + assertEquals("", GridCleaningRule.clean("493x784.5x")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java new file mode 100644 index 000000000..e51d1e05c --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class ISNICleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("0000000463436020", ISNICleaningRule.clean("0000 0004 6343 6020")); + assertEquals("0000000463436020", ISNICleaningRule.clean("0000000463436020")); + assertEquals("", ISNICleaningRule.clean("Q30256598")); + assertEquals("0000000493403529", ISNICleaningRule.clean("ISNI:0000000493403529")); + assertEquals("000000008614884X", ISNICleaningRule.clean("0000 0000 8614 884X")); + } + +} diff --git 
a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java new file mode 100644 index 000000000..3736033c3 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class PICCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("887624982", PICCleaningRule.clean("887624982")); + assertEquals("", PICCleaningRule.clean("887 624982")); + assertEquals("887624982", PICCleaningRule.clean(" 887624982 ")); + assertEquals("887624982", PICCleaningRule.clean(" 887624982x ")); + assertEquals("887624982", PICCleaningRule.clean(" 88762498200 ")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java new file mode 100644 index 000000000..e53ebae89 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class PmcCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("PMC1234", PmcCleaningRule.clean("PMC1234")); + assertEquals("PMC1234", PmcCleaningRule.clean(" PMC1234")); + assertEquals("PMC12345678", PmcCleaningRule.clean("PMC12345678")); + assertEquals("PMC12345678", PmcCleaningRule.clean("PMC123456789")); + assertEquals("PMC12345678", PmcCleaningRule.clean("PMC 12345678")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java new file mode 100644 index 
000000000..9562adf7e --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java @@ -0,0 +1,18 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class PmidCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("1234", PmidCleaningRule.clean("01234")); + assertEquals("1234567", PmidCleaningRule.clean("0123 4567")); + assertEquals("123", PmidCleaningRule.clean("0123x4567")); + assertEquals("", PmidCleaningRule.clean("abc")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java new file mode 100644 index 000000000..5d5c03959 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java @@ -0,0 +1,17 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class RorCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("https://ror.org/05rpz9w55", RorCleaningRule.clean("https://ror.org/05rpz9w55")); + assertEquals("https://ror.org/05rpz9w55", RorCleaningRule.clean("05rpz9w55")); + assertEquals("", RorCleaningRule.clean("05rpz9w_55")); + } + +} From 0bc74e2000e28b2391e39f4bca884a084dc75c72 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 2 Aug 2023 11:52:10 +0200 Subject: [PATCH 6/6] code formatting --- .../oaf/utils/GraphCleaningFunctions.java | 26 +++++++++---------- .../clean/GraphCleaningFunctionsTest.java | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 1bfb54278..8afa41f95 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -226,19 +226,19 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static boolean filter(T value) { if (!(value instanceof Relation) && (Boolean.TRUE - .equals( - Optional - .ofNullable(value) - .map( - o -> Optional - .ofNullable(o.getDataInfo()) - .map( - d -> Optional - .ofNullable(d.getInvisible()) - .orElse(true)) - .orElse(false)) - .orElse(true)))) { - return true; + .equals( + Optional + .ofNullable(value) + .map( + o -> Optional + .ofNullable(o.getDataInfo()) + .map( + d -> Optional + .ofNullable(d.getInvisible()) + .orElse(true)) + .orElse(false)) + .orElse(true)))) { + return true; } if (value instanceof Datasource) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 1492c60fe..8d10508a9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -371,7 +371,7 @@ public class GraphCleaningFunctionsTest { @Test public void testFilterProject() throws IOException { String json = IOUtils - .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/project.json")); + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/project.json")); Project p_in = MAPPER.readValue(json, Project.class); Assertions.assertEquals(false, GraphCleaningFunctions.filter(p_in));