forked from D-Net/dnet-hadoop
Fix cleaning of PMIDs where parsing of the number stopped at the first non-leading '0' character
This commit is contained in:
parent
6856ab28ab
commit
2f3cf6d0e7
|
@ -7,7 +7,7 @@ import java.util.regex.Pattern;
|
||||||
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
|
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
|
||||||
public class PmidCleaningRule {
|
public class PmidCleaningRule {
|
||||||
|
|
||||||
public static final Pattern PATTERN = Pattern.compile("[1-9]{1,8}");
|
public static final Pattern PATTERN = Pattern.compile("0*(\\d{1,8})");
|
||||||
|
|
||||||
public static String clean(String pmid) {
|
public static String clean(String pmid) {
|
||||||
String s = pmid
|
String s = pmid
|
||||||
|
@ -17,7 +17,7 @@ public class PmidCleaningRule {
|
||||||
final Matcher m = PATTERN.matcher(s);
|
final Matcher m = PATTERN.matcher(s);
|
||||||
|
|
||||||
if (m.find()) {
|
if (m.find()) {
|
||||||
return m.group();
|
return m.group(1);
|
||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,10 +9,16 @@ class PmidCleaningRuleTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testCleaning() {
|
void testCleaning() {
|
||||||
|
// leading zeros are removed
|
||||||
assertEquals("1234", PmidCleaningRule.clean("01234"));
|
assertEquals("1234", PmidCleaningRule.clean("01234"));
|
||||||
|
// tolerant to spaces in the middle
|
||||||
assertEquals("1234567", PmidCleaningRule.clean("0123 4567"));
|
assertEquals("1234567", PmidCleaningRule.clean("0123 4567"));
|
||||||
|
// stop parsing at the first non-numeric char
|
||||||
assertEquals("123", PmidCleaningRule.clean("0123x4567"));
|
assertEquals("123", PmidCleaningRule.clean("0123x4567"));
|
||||||
|
// invalid id leading to empty result
|
||||||
assertEquals("", PmidCleaningRule.clean("abc"));
|
assertEquals("", PmidCleaningRule.clean("abc"));
|
||||||
|
// valid id with zeroes in the number
|
||||||
|
assertEquals("20794075", PmidCleaningRule.clean("20794075"));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue