From 0f5a819f44a519028c45cd5a73224f2bff9674d9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 23 Jun 2023 16:10:49 +0200 Subject: [PATCH] [graph cleaning] fixed regex behaviour for cleaning ROR and GRID identifiers, added tests --- .../schema/oaf/utils/FundRefCleaningRule.java | 10 ++++++---- .../schema/oaf/utils/GridCleaningRule.java | 10 ++++++++-- .../schema/oaf/utils/ISNICleaningRule.java | 6 ++++-- .../dhp/schema/oaf/utils/PICCleaningRule.java | 6 ++++-- .../dhp/schema/oaf/utils/PmcCleaningRule.java | 13 ++++++++++++- .../schema/oaf/utils/PmidCleaningRule.java | 17 +++++++++++++---- .../dhp/schema/oaf/utils/RorCleaningRule.java | 13 +++++++++++-- .../oaf/utils/GridCleaningRuleTest.java | 18 ++++++++++++++++++ .../oaf/utils/ISNICleaningRuleTest.java | 19 +++++++++++++++++++ .../schema/oaf/utils/PICCleaningRuleTest.java | 19 +++++++++++++++++++ .../schema/oaf/utils/PmcCleaningRuleTest.java | 19 +++++++++++++++++++ .../oaf/utils/PmidCleaningRuleTest.java | 18 ++++++++++++++++++ .../schema/oaf/utils/RorCleaningRuleTest.java | 17 +++++++++++++++++ 13 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java index 7f63038257..a267b8b88e 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java @@ -6,14 +6,16 @@ import java.util.regex.Pattern; public class FundRefCleaningRule { - public static String clean(final String fundrefId) { + public static final Pattern PATTERN = Pattern.compile("\\d+"); - String s = fundrefId + public static String clean(final String fundRefId) { + + String s = fundRefId .toLowerCase() .replaceAll("\\s", ""); - Matcher m = Pattern.compile("\\d+").matcher(s); - if (m.matches()) { + Matcher m = PATTERN.matcher(s); + if (m.find()) { return m.group(); } else { return ""; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java index ff45d6a0d3..37ab91dd52 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java @@ -6,13 +6,19 @@ import java.util.regex.Pattern; public class GridCleaningRule { + public static final Pattern PATTERN = Pattern.compile("(?<grid>\\d{4,6}\\.[0-9a-z]{1,2})"); + public static String clean(String grid) { String s = grid .replaceAll("\\s", "") .toLowerCase(); - Matcher m = Pattern.compile("\\d{4,6}\\.[0-9a-z]{1,2}").matcher(s); - return m.matches() ? "grid." + m.group() : ""; + Matcher m = PATTERN.matcher(s); + if (m.find()) { + return "grid." 
+ m.group("grid"); + } + + return ""; } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java index 5bc49c453a..bcd8279cc3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java @@ -7,10 +7,12 @@ import java.util.regex.Pattern; // https://www.wikidata.org/wiki/Property:P213 public class ISNICleaningRule { + public static final Pattern PATTERN = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])"); + public static String clean(final String isni) { - Matcher m = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])").matcher(isni); - if (m.matches()) { + Matcher m = PATTERN.matcher(isni); + if (m.find()) { return String.join("", m.group(1), m.group(2), m.group(3), m.group(4)); } else { return ""; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java index 83b9a1f9f3..a2213ed9f5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java @@ -6,10 +6,12 @@ import java.util.regex.Pattern; public class PICCleaningRule { + public static final Pattern PATTERN = Pattern.compile("\\d{9}"); + public static String clean(final String pic) { - Matcher m = Pattern.compile("\\d{9}").matcher(pic); - if (m.matches()) { + Matcher m = PATTERN.matcher(pic); + if (m.find()) { return m.group(); } else { return ""; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java index 4e12058055..903041d436 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java @@ -1,13 +1,24 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + public class PmcCleaningRule { + public static final Pattern PATTERN = Pattern.compile("PMC\\d{1,8}"); + public static String clean(String pmc) { String s = pmc .replaceAll("\\s", "") .toUpperCase(); - return s.matches("^PMC\\d{1,8}$") ? s : ""; + + final Matcher m = PATTERN.matcher(s); + + if (m.find()) { + return m.group(); + } + return ""; } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java index 65833a594a..d0f5a3b27e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java @@ -1,16 +1,25 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + // https://researchguides.stevens.edu/c.php?g=442331&p=6577176 public class PmidCleaningRule { + public static final Pattern PATTERN = Pattern.compile("[1-9]{1,8}"); + public static String clean(String pmid) { String s = pmid .toLowerCase() - .replaceAll("\\s", "") - .trim() - .replaceAll("^0+", ""); - return s.matches("^\\d{1,8}$") ? 
s : ""; + .replaceAll("\\s", ""); + + final Matcher m = PATTERN.matcher(s); + + if (m.find()) { + return m.group(); + } + return ""; } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java index f40cdb00c4..f6685f19da 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java @@ -7,12 +7,21 @@ import java.util.regex.Pattern; // https://ror.readme.io/docs/ror-identifier-pattern public class RorCleaningRule { + public static final String ROR_PREFIX = "https://ror.org/"; + + private static final Pattern PATTERN = Pattern.compile("(?<ror>0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2})"); + public static String clean(String ror) { String s = ror .replaceAll("\\s", "") .toLowerCase(); - Matcher m = Pattern.compile("0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}").matcher(s); - return m.matches() ? 
"https://ror.org/" + m.group() : ""; + + Matcher m = PATTERN.matcher(s); + + if (m.find()) { + return ROR_PREFIX + m.group("ror"); + } + return ""; } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java new file mode 100644 index 0000000000..1b9163d464 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRuleTest.java @@ -0,0 +1,18 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class GridCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("grid.493784.5", GridCleaningRule.clean("grid.493784.5")); + assertEquals("grid.493784.5x", GridCleaningRule.clean("grid.493784.5x")); + assertEquals("grid.493784.5x", GridCleaningRule.clean("493784.5x")); + assertEquals("", GridCleaningRule.clean("493x784.5x")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java new file mode 100644 index 0000000000..e51d1e05c9 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRuleTest.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class ISNICleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("0000000463436020", ISNICleaningRule.clean("0000 0004 6343 6020")); + assertEquals("0000000463436020", ISNICleaningRule.clean("0000000463436020")); + assertEquals("", ISNICleaningRule.clean("Q30256598")); + assertEquals("0000000493403529", ISNICleaningRule.clean("ISNI:0000000493403529")); + assertEquals("000000008614884X", ISNICleaningRule.clean("0000 0000 8614 884X")); + } + +} diff --git 
a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java new file mode 100644 index 0000000000..3736033c33 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRuleTest.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class PICCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("887624982", PICCleaningRule.clean("887624982")); + assertEquals("", PICCleaningRule.clean("887 624982")); + assertEquals("887624982", PICCleaningRule.clean(" 887624982 ")); + assertEquals("887624982", PICCleaningRule.clean(" 887624982x ")); + assertEquals("887624982", PICCleaningRule.clean(" 88762498200 ")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java new file mode 100644 index 0000000000..e53ebae897 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRuleTest.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class PmcCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("PMC1234", PmcCleaningRule.clean("PMC1234")); + assertEquals("PMC1234", PmcCleaningRule.clean(" PMC1234")); + assertEquals("PMC12345678", PmcCleaningRule.clean("PMC12345678")); + assertEquals("PMC12345678", PmcCleaningRule.clean("PMC123456789")); + assertEquals("PMC12345678", PmcCleaningRule.clean("PMC 12345678")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java new file mode 100644 index 
0000000000..9562adf7e0 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java @@ -0,0 +1,18 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class PmidCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("1234", PmidCleaningRule.clean("01234")); + assertEquals("1234567", PmidCleaningRule.clean("0123 4567")); + assertEquals("123", PmidCleaningRule.clean("0123x4567")); + assertEquals("", PmidCleaningRule.clean("abc")); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java new file mode 100644 index 0000000000..5d5c03959d --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRuleTest.java @@ -0,0 +1,17 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class RorCleaningRuleTest { + + @Test + void testCleaning() { + assertEquals("https://ror.org/05rpz9w55", RorCleaningRule.clean("https://ror.org/05rpz9w55")); + assertEquals("https://ror.org/05rpz9w55", RorCleaningRule.clean("05rpz9w55")); + assertEquals("", RorCleaningRule.clean("05rpz9w_55")); + } + +}