forked from D-Net/dnet-hadoop
Merge pull request '[graph cleaning] fixed regex behaviour for cleaning ROR and GRID identifiers, added tests' (#315) from pid_cleaning into beta
Reviewed-on: D-Net/dnet-hadoop#315
This commit is contained in:
commit
002b24e06f
|
@ -6,14 +6,16 @@ import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class FundRefCleaningRule {
|
public class FundRefCleaningRule {
|
||||||
|
|
||||||
public static String clean(final String fundrefId) {
|
public static final Pattern PATTERN = Pattern.compile("\\d+");
|
||||||
|
|
||||||
String s = fundrefId
|
public static String clean(final String fundRefId) {
|
||||||
|
|
||||||
|
String s = fundRefId
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
.replaceAll("\\s", "");
|
.replaceAll("\\s", "");
|
||||||
|
|
||||||
Matcher m = Pattern.compile("\\d+").matcher(s);
|
Matcher m = PATTERN.matcher(s);
|
||||||
if (m.matches()) {
|
if (m.find()) {
|
||||||
return m.group();
|
return m.group();
|
||||||
} else {
|
} else {
|
||||||
return "";
|
return "";
|
||||||
|
|
|
@ -6,13 +6,19 @@ import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
 * Cleaning rule for GRID identifiers.
 *
 * <p>The identifier is extracted from the input rather than validated as a whole: the first
 * substring matching {@link #PATTERN} is returned, so surrounding text such as a leading
 * {@code grid.} prefix is ignored.
 */
public class GridCleaningRule {

	/**
	 * 4 to 6 digits, a dot, then 1 or 2 characters in {@code [0-9a-z]}; the match is exposed as the
	 * named group {@code grid}.
	 */
	public static final Pattern PATTERN = Pattern.compile("(?<grid>\\d{4,6}\\.[0-9a-z]{1,2})");

	/**
	 * Normalises a GRID identifier.
	 *
	 * @param grid the raw identifier, possibly containing whitespace, upper-case letters or a prefix
	 * @return {@code "grid."} followed by the first match of {@link #PATTERN}, or the empty string when
	 *         the input is {@code null} or contains no match
	 */
	public static String clean(String grid) {
		if (grid == null) {
			return "";
		}

		// Locale.ROOT keeps the lower-casing independent of the JVM default locale (under a Turkish
		// locale 'I' would become a dotless 'ı', which PATTERN does not accept). The name is fully
		// qualified so that no additional import is required.
		final String s = grid
			.replaceAll("\\s", "")
			.toLowerCase(java.util.Locale.ROOT);

		final Matcher m = PATTERN.matcher(s);
		if (m.find()) {
			return "grid." + m.group("grid");
		}

		return "";
	}

}
|
||||||
|
|
|
@ -7,10 +7,12 @@ import java.util.regex.Pattern;
|
||||||
// https://www.wikidata.org/wiki/Property:P213
|
// https://www.wikidata.org/wiki/Property:P213
|
||||||
public class ISNICleaningRule {
|
public class ISNICleaningRule {
|
||||||
|
|
||||||
|
public static final Pattern PATTERN = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])");
|
||||||
|
|
||||||
public static String clean(final String isni) {
|
public static String clean(final String isni) {
|
||||||
|
|
||||||
Matcher m = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])").matcher(isni);
|
Matcher m = PATTERN.matcher(isni);
|
||||||
if (m.matches()) {
|
if (m.find()) {
|
||||||
return String.join("", m.group(1), m.group(2), m.group(3), m.group(4));
|
return String.join("", m.group(1), m.group(2), m.group(3), m.group(4));
|
||||||
} else {
|
} else {
|
||||||
return "";
|
return "";
|
||||||
|
|
|
@ -6,10 +6,12 @@ import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class PICCleaningRule {
|
public class PICCleaningRule {
|
||||||
|
|
||||||
|
public static final Pattern PATTERN = Pattern.compile("\\d{9}");
|
||||||
|
|
||||||
public static String clean(final String pic) {
|
public static String clean(final String pic) {
|
||||||
|
|
||||||
Matcher m = Pattern.compile("\\d{9}").matcher(pic);
|
Matcher m = PATTERN.matcher(pic);
|
||||||
if (m.matches()) {
|
if (m.find()) {
|
||||||
return m.group();
|
return m.group();
|
||||||
} else {
|
} else {
|
||||||
return "";
|
return "";
|
||||||
|
|
|
@ -1,13 +1,24 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
 * Cleaning rule for PMC identifiers.
 *
 * <p>The identifier is extracted from the input rather than validated as a whole: the first
 * substring matching {@link #PATTERN} is returned.
 */
public class PmcCleaningRule {

	/** The literal {@code PMC} followed by 1 to 8 digits. */
	public static final Pattern PATTERN = Pattern.compile("PMC\\d{1,8}");

	/**
	 * Normalises a PMC identifier.
	 *
	 * <p>Whitespace is removed and the text is upper-cased before matching. Because at most 8 digits
	 * are consumed, a longer digit run is truncated to its first 8 digits.
	 *
	 * @param pmc the raw identifier
	 * @return the first match of {@link #PATTERN}, or the empty string when the input is {@code null}
	 *         or contains no match
	 */
	public static String clean(String pmc) {
		if (pmc == null) {
			return "";
		}

		// Locale.ROOT makes the upper-casing independent of the JVM default locale; the name is fully
		// qualified so that no additional import is required.
		final String s = pmc
			.replaceAll("\\s", "")
			.toUpperCase(java.util.Locale.ROOT);

		final Matcher m = PATTERN.matcher(s);
		if (m.find()) {
			return m.group();
		}

		return "";
	}

}
|
||||||
|
|
|
@ -1,16 +1,25 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
|
// https://researchguides.stevens.edu/c.php?g=442331&p=6577176
|
||||||
/**
 * Cleaning rule for PMID identifiers.
 *
 * <p>The identifier is extracted from the input rather than validated as a whole: the first
 * substring matching {@link #PATTERN} is returned, which also discards any leading zeros.
 */
public class PmidCleaningRule {

	/**
	 * A non-zero digit followed by up to 7 further digits.
	 *
	 * <p>Only the first digit excludes {@code 0}: the former {@code [1-9]{1,8}} rejected zeros
	 * everywhere and therefore cut identifiers at their first zero (e.g. {@code 10234} became
	 * {@code 1}).
	 */
	public static final Pattern PATTERN = Pattern.compile("[1-9]\\d{0,7}");

	/**
	 * Normalises a PMID.
	 *
	 * <p>Whitespace is removed before matching. Because at most 8 digits are consumed, a longer digit
	 * run is truncated to its first 8 digits.
	 *
	 * @param pmid the raw identifier
	 * @return the first match of {@link #PATTERN}, or the empty string when the input is {@code null}
	 *         or contains no match
	 */
	public static String clean(String pmid) {
		if (pmid == null) {
			return "";
		}

		// No case folding is needed: PATTERN only matches ASCII digits, which have no case.
		final String s = pmid.replaceAll("\\s", "");

		final Matcher m = PATTERN.matcher(s);
		if (m.find()) {
			return m.group();
		}

		return "";
	}

}
|
||||||
|
|
|
@ -7,12 +7,21 @@ import java.util.regex.Pattern;
|
||||||
// https://ror.readme.io/docs/ror-identifier-pattern
|
// https://ror.readme.io/docs/ror-identifier-pattern
|
||||||
/**
 * Cleaning rule for ROR identifiers.
 *
 * <p>The identifier is extracted from the input rather than validated as a whole: the first
 * substring matching the identifier pattern is returned, so an already present
 * {@code https://ror.org/} prefix is ignored and re-added in canonical form.
 */
public class RorCleaningRule {

	/** Prefix prepended to every cleaned identifier. */
	public static final String ROR_PREFIX = "https://ror.org/";

	/**
	 * A leading {@code 0}, 6 characters drawn from digits and the lower-case letters except
	 * {@code i}, {@code l}, {@code o} and {@code u}, then 2 digits; exposed as the named group
	 * {@code ror}.
	 *
	 * <p>The character class deliberately contains no {@code |}: inside brackets a pipe is a literal
	 * character, not an alternation, so keeping it would accept identifiers containing pipes.
	 */
	private static final Pattern PATTERN = Pattern.compile("(?<ror>0[a-hj-km-np-tv-z0-9]{6}[0-9]{2})");

	/**
	 * Normalises a ROR identifier.
	 *
	 * @param ror the raw identifier, either bare or as a URL
	 * @return {@link #ROR_PREFIX} followed by the first match of the identifier pattern, or the empty
	 *         string when the input is {@code null} or contains no match
	 */
	public static String clean(String ror) {
		if (ror == null) {
			return "";
		}

		// Locale.ROOT makes the lower-casing independent of the JVM default locale; the name is fully
		// qualified so that no additional import is required.
		final String s = ror
			.replaceAll("\\s", "")
			.toLowerCase(java.util.Locale.ROOT);

		final Matcher m = PATTERN.matcher(s);
		if (m.find()) {
			return ROR_PREFIX + m.group("ror");
		}

		return "";
	}

}
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class GridCleaningRuleTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning() {
|
||||||
|
assertEquals("grid.493784.5", GridCleaningRule.clean("grid.493784.5"));
|
||||||
|
assertEquals("grid.493784.5x", GridCleaningRule.clean("grid.493784.5x"));
|
||||||
|
assertEquals("grid.493784.5x", GridCleaningRule.clean("493784.5x"));
|
||||||
|
assertEquals("", GridCleaningRule.clean("493x784.5x"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class ISNICleaningRuleTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning() {
|
||||||
|
assertEquals("0000000463436020", ISNICleaningRule.clean("0000 0004 6343 6020"));
|
||||||
|
assertEquals("0000000463436020", ISNICleaningRule.clean("0000000463436020"));
|
||||||
|
assertEquals("", ISNICleaningRule.clean("Q30256598"));
|
||||||
|
assertEquals("0000000493403529", ISNICleaningRule.clean("ISNI:0000000493403529"));
|
||||||
|
assertEquals("000000008614884X", ISNICleaningRule.clean("0000 0000 8614 884X"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class PICCleaningRuleTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning() {
|
||||||
|
assertEquals("887624982", PICCleaningRule.clean("887624982"));
|
||||||
|
assertEquals("", PICCleaningRule.clean("887 624982"));
|
||||||
|
assertEquals("887624982", PICCleaningRule.clean(" 887624982 "));
|
||||||
|
assertEquals("887624982", PICCleaningRule.clean(" 887624982x "));
|
||||||
|
assertEquals("887624982", PICCleaningRule.clean(" 88762498200 "));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class PmcCleaningRuleTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning() {
|
||||||
|
assertEquals("PMC1234", PmcCleaningRule.clean("PMC1234"));
|
||||||
|
assertEquals("PMC1234", PmcCleaningRule.clean(" PMC1234"));
|
||||||
|
assertEquals("PMC12345678", PmcCleaningRule.clean("PMC12345678"));
|
||||||
|
assertEquals("PMC12345678", PmcCleaningRule.clean("PMC123456789"));
|
||||||
|
assertEquals("PMC12345678", PmcCleaningRule.clean("PMC 12345678"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class PmidCleaningRuleTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning() {
|
||||||
|
assertEquals("1234", PmidCleaningRule.clean("01234"));
|
||||||
|
assertEquals("1234567", PmidCleaningRule.clean("0123 4567"));
|
||||||
|
assertEquals("123", PmidCleaningRule.clean("0123x4567"));
|
||||||
|
assertEquals("", PmidCleaningRule.clean("abc"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class RorCleaningRuleTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCleaning() {
|
||||||
|
assertEquals("https://ror.org/05rpz9w55", RorCleaningRule.clean("https://ror.org/05rpz9w55"));
|
||||||
|
assertEquals("https://ror.org/05rpz9w55", RorCleaningRule.clean("05rpz9w55"));
|
||||||
|
assertEquals("", RorCleaningRule.clean("05rpz9w_55"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue