IdentifierFactory considers only DOIs matching a given regex

This commit is contained in:
Claudio Atzori 2020-11-03 18:43:37 +01:00
parent 86d6fbe95b
commit ea2a0ea949
2 changed files with 24 additions and 8 deletions

View File

@ -12,12 +12,10 @@ import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists; import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class CleaningFunctions { public class CleaningFunctions {
public static final String DOI_URL_PREFIX = "^http(s?):\\/\\/(dx\\.)?doi\\.org\\/"; public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NONE = "none"; public static final String NONE = "none";
@ -231,7 +229,7 @@ public class CleaningFunctions {
// TODO add cleaning for more PID types as needed // TODO add cleaning for more PID types as needed
case "doi": case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX, "")); pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
break; break;
} }
return pid; return pid;

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable; import java.io.Serializable;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -21,6 +22,12 @@ public class IdentifierFactory implements Serializable {
public static final String ID_PREFIX_SEPARATOR = "|"; public static final String ID_PREFIX_SEPARATOR = "|";
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
+ "[a-zA-Z0-9]{32}$"; + "[a-zA-Z0-9]{32}$";
/**
 * Regular expression a (trimmed, lower-cased) DOI value must match in full to be accepted by
 * {@code pidFilter}. It is the union of four anchored alternatives:
 * <ul>
 * <li>{@code 10.<4-9 digits>/<suffix>} where the suffix only contains letters, digits and
 * {@code - . _ ; ( ) / :}</li>
 * <li>{@code 10.1002/<suffix>} where the suffix is any run of non-whitespace characters</li>
 * <li>{@code 10.1021/<suffix>} where the suffix is two word characters followed by digits</li>
 * <li>{@code 10.1207/<suffix>} where the suffix is word characters, {@code &}, digits,
 * {@code _}, digits</li>
 * </ul>
 * NOTE(review): the alternatives look like the DOI patterns published by Crossref - confirm.
 */
public static final String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)"
    + "|(^10\\.1002\\/[^\\s]+$)"
    + "|(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)"
    + "|(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12; public static final int ID_PREFIX_LEN = 12;
public static final String NONE = "none"; public static final String NONE = "none";
@ -48,10 +55,21 @@ public class IdentifierFactory implements Serializable {
} }
protected static boolean pidFilter(StructuredProperty s) { protected static boolean pidFilter(StructuredProperty s) {
return Objects.nonNull(s.getQualifier()) && if (Objects.isNull(s.getQualifier()) ||
PidType.isValid(s.getQualifier().getClassid()) && StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
StringUtils.isNotBlank(StringUtils.trim(s.getValue())) && return false;
!NONE.equals(StringUtils.trim(StringUtils.lowerCase(s.getValue()))); }
try {
switch (PidType.valueOf(s.getQualifier().getClassid())) {
case doi:
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
return doi.matches(DOI_REGEX);
default:
return true;
}
} catch (IllegalArgumentException e) {
return false;
}
} }
private static String verifyIdSyntax(String s) { private static String verifyIdSyntax(String s) {