From 6cb0dc3f433d385739cac12049b44a9d128b220c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 21 Dec 2020 11:40:17 +0100 Subject: [PATCH] extended OCRID cleaning procedure --- .../dhp/oa/graph/clean/CleaningFunctions.java | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index fbb20f1433..798eae4ddd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -16,7 +16,7 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; - public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final Set PID_BLACKLIST = new HashSet<>(); @@ -211,14 +211,31 @@ public class CleaningFunctions { .map(Qualifier::getClassid) .orElse("")) .orElse(""); - if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { - p.getQualifier().setClassid(ModelConstants.ORCID); - } else { - p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); + if (p + .getQualifier() + .getClassid() + .toLowerCase() + .contains(ModelConstants.ORCID)) { + if (pidProvenance + .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { + p.getQualifier().setClassid(ModelConstants.ORCID); + } else { + p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); + } + final String orcid = p + .getValue() + .trim() + .toLowerCase() + .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); + if (orcid.length() == 19) { + p.setValue(orcid); + } else { + p.setValue(""); + } } - p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); return p; }) + .filter(p -> StringUtils.isNotBlank(p.getValue())) .collect( Collectors .toMap(