forked from D-Net/dnet-hadoop
[Cleaning] updated cleaning rule for DOIs
This commit is contained in:
parent
f667e94a31
commit
34d653de41
|
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class CleaningFunctions {
|
public class CleaningFunctions {
|
||||||
|
|
||||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
public static final String DOI_PREFIX_REGEX = "^10\\.";
|
||||||
|
|
||||||
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
||||||
public static final int ORCID_LEN = 19;
|
public static final int ORCID_LEN = 19;
|
||||||
|
@ -308,7 +308,7 @@ public class CleaningFunctions {
|
||||||
|
|
||||||
// TODO add cleaning for more PID types as needed
|
// TODO add cleaning for more PID types as needed
|
||||||
case "doi":
|
case "doi":
|
||||||
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
|
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return pid;
|
return pid;
|
||||||
|
|
Loading…
Reference in New Issue