forked from D-Net/dnet-hadoop
Merge pull request 'Fix cleaning of PMID where parsing of numbers stopped at the first non-leading 0 (zero) character' (#345) from fix_truncated_pmid into master
Reviewed-on: D-Net/dnet-hadoop#345
This commit is contained in:
commit
ba5475ed4c
|
@ -7,7 +7,7 @@ import java.util.regex.Pattern;
|
|||
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
|
||||
public class PmidCleaningRule {
|
||||
|
||||
public static final Pattern PATTERN = Pattern.compile("[1-9]{1,8}");
|
||||
public static final Pattern PATTERN = Pattern.compile("0*(\\d{1,8})");
|
||||
|
||||
public static String clean(String pmid) {
|
||||
String s = pmid
|
||||
|
@ -17,7 +17,7 @@ public class PmidCleaningRule {
|
|||
final Matcher m = PATTERN.matcher(s);
|
||||
|
||||
if (m.find()) {
|
||||
return m.group();
|
||||
return m.group(1);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
|
|
@ -9,10 +9,16 @@ class PmidCleaningRuleTest {
|
|||
|
||||
/** Checks the output of {@code PmidCleaningRule.clean} for a handful of sample inputs. */
@Test
|
||||
void testCleaning() {
|
||||
// leading zeros are removed
|
||||
assertEquals("1234", PmidCleaningRule.clean("01234"));
|
||||
// spaces in the middle are tolerated
|
||||
assertEquals("1234567", PmidCleaningRule.clean("0123 4567"));
|
||||
// parsing stops at the first non-numeric character
|
||||
assertEquals("123", PmidCleaningRule.clean("0123x4567"));
|
||||
// an invalid id leads to an empty result
|
||||
assertEquals("", PmidCleaningRule.clean("abc"));
|
||||
// a valid id containing zeros inside the number is returned unchanged
|
||||
assertEquals("20794075", PmidCleaningRule.clean("20794075"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue