From 2c235e82ad8aa46e01b365f0f077279fba40290e Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 6 Oct 2023 12:35:54 +0200 Subject: [PATCH] Fix cleaning of Pmid where parsing of numbers stopped at first not leading 0' character --- .../eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java | 4 ++-- .../dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java index d0f5a3b27..c0c451b88 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java @@ -7,7 +7,7 @@ import java.util.regex.Pattern; // https://researchguides.stevens.edu/c.php?g=442331&p=6577176 public class PmidCleaningRule { - public static final Pattern PATTERN = Pattern.compile("[1-9]{1,8}"); + public static final Pattern PATTERN = Pattern.compile("0*(\\d{1,8})"); public static String clean(String pmid) { String s = pmid @@ -17,7 +17,7 @@ public class PmidCleaningRule { final Matcher m = PATTERN.matcher(s); if (m.find()) { - return m.group(); + return m.group(1); } return ""; } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java index 9562adf7e..295eac85f 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRuleTest.java @@ -9,10 +9,16 @@ class PmidCleaningRuleTest { @Test void testCleaning() { + // leading zeros are removed assertEquals("1234", PmidCleaningRule.clean("01234")); + // tolerant to spaces in the middle assertEquals("1234567", PmidCleaningRule.clean("0123 4567")); + // stop parsing at first not numerical char assertEquals("123", PmidCleaningRule.clean("0123x4567")); + // invalid id leading to empty result assertEquals("", PmidCleaningRule.clean("abc")); + // valid id with zeroes in the number + assertEquals("20794075", PmidCleaningRule.clean("20794075")); } }