forked from D-Net/dnet-hadoop
[Cleaning] updated cleaning rule for DOIs
This commit is contained in:
parent
f667e94a31
commit
34d653de41
|
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
|||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||
public static final String DOI_PREFIX_REGEX = "^10\\.";
|
||||
|
||||
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
||||
public static final int ORCID_LEN = 19;
|
||||
|
@ -308,7 +308,7 @@ public class CleaningFunctions {
|
|||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
|
||||
break;
|
||||
}
|
||||
return pid;
|
||||
|
|
Loading…
Reference in New Issue