From 491ad2475052dec98c329906352413d0d0e6001d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 9 Dec 2020 09:10:33 +0100 Subject: [PATCH] introduced filtering for DOIs in graph cleaning workflow --- .../dhp/schema/oaf/CleaningFunctions.java | 28 +++++++++++++++++-- .../schema/oaf/utils/IdentifierFactory.java | 18 ++++++------ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 5f191e9a9..8ce4285d6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -13,7 +13,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; public class CleaningFunctions { - public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; + public static final String DOI_PREFIX_REGEX = "^.*10\\."; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; @@ -146,6 +146,7 @@ public class CleaningFunctions { .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .map(CleaningFunctions::normalizePidValue) + .filter(CleaningFunctions::filterPid) .collect(Collectors.toList())); } if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { @@ -253,6 +254,29 @@ public class CleaningFunctions { classid, classname, scheme, scheme); } + /** + * Utility method that filters PID values on a per-type basis. + * @param pid the PID whose value will be checked. + * @return false if the PID value is blank or, for a DOI, does not start with "10." (whitespace is ignored); true otherwise. 
+ */ + private static boolean filterPid(StructuredProperty pid) { + String value = Optional + .ofNullable(pid.getValue()) + .map(s -> StringUtils.replaceAll(s, "\\s", "")) + .orElse(""); + if (StringUtils.isBlank(value)) { + return false; + } + switch (pid.getQualifier().getClassid()) { + + // TODO add cleaning for more PID types as needed + case "doi": + return value.startsWith("10."); + default: + return true; + } + } + /** * Utility method that normalises PID values on a per-type basis. * @param pid the PID whose value will be normalised. @@ -267,7 +291,7 @@ public class CleaningFunctions { // TODO add cleaning for more PID types as needed case "doi": - pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, "")); + pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10.")); break; } return pid; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index a6b2ce29b..9978194ac 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -1,18 +1,18 @@ package eu.dnetlib.dhp.schema.oaf.utils; -import java.io.IOException; -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; - import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.lang3.StringUtils; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; /** * Factory class for OpenAIRE identifiers in the Graph @@ -21,8 +21,6 @@ public class IdentifierFactory implements 
Serializable { public static final String ID_SEPARATOR = "::"; public static final String ID_PREFIX_SEPARATOR = "|"; - public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR - + "[a-zA-Z0-9]{32}$"; public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" + "(^10\\.1002\\/[^\\s]+$)|" +