Merge pull request 'Fix cleaning of Pmid where parsing of numbers stopped at first not leading 0 (zero) character' (#345) from fix_truncated_pmid into master

Reviewed-on: D-Net/dnet-hadoop#345
This commit is contained in:
Claudio Atzori 2023-10-06 14:19:49 +02:00
commit ba5475ed4c
2 changed files with 8 additions and 2 deletions

View File

@ -7,7 +7,7 @@ import java.util.regex.Pattern;
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
public class PmidCleaningRule {
public static final Pattern PATTERN = Pattern.compile("[1-9]{1,8}");
public static final Pattern PATTERN = Pattern.compile("0*(\\d{1,8})");
public static String clean(String pmid) {
String s = pmid
@ -17,7 +17,7 @@ public class PmidCleaningRule {
final Matcher m = PATTERN.matcher(s);
if (m.find()) {
return m.group();
return m.group(1);
}
return "";
}

View File

@ -9,10 +9,16 @@ class PmidCleaningRuleTest {
@Test
void testCleaning() {
// leading zeros are removed
assertEquals("1234", PmidCleaningRule.clean("01234"));
// tolerant to spaces in the middle
assertEquals("1234567", PmidCleaningRule.clean("0123 4567"));
// stop parsing at first not numerical char
assertEquals("123", PmidCleaningRule.clean("0123x4567"));
// invalid id leading to empty result
assertEquals("", PmidCleaningRule.clean("abc"));
// valid id with zeroes in the number
assertEquals("20794075", PmidCleaningRule.clean("20794075"));
}
}