[Cleaning] updated cleaning rule for DOIs

This commit is contained in:
Claudio Atzori 2021-01-22 14:16:33 +01:00
parent f667e94a31
commit 34d653de41
1 changed file with 2 additions and 2 deletions

View File

@ -15,7 +15,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String DOI_PREFIX_REGEX = "^10\\.";
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19;
@ -308,7 +308,7 @@ public class CleaningFunctions {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
break;
}
return pid;