From 1abcabb6e6240758748f0ba15a0eee9b1dd4efb1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 6 Oct 2020 18:55:23 +0200 Subject: [PATCH] WIP stable ids: IdentifierFactory & unit test --- .../schema/oaf/utils/IdentifierFactory.java | 27 ++------ .../dhp/schema/oaf/utils/PidComparator.java | 62 ++++++++++++++----- .../dnetlib/dhp/schema/oaf/utils/PidType.java | 17 +++++ .../oaf/utils/IdentifierFactoryTest.java | 43 +++++++++++++ .../dhp/schema/oaf/utils/publication_3.json | 1 + .../dhp/schema/oaf/utils/publication_4.json | 1 + .../dhp/schema/oaf/utils/publication_5.json | 1 + .../dhp/schema/oaf/utils/publication_doi.json | 1 + .../dhp/schema/oaf/utils/publication_pmc.json | 1 + .../dhp/schema/oaf/utils/publication_urn.json | 1 + 10 files changed, 116 insertions(+), 39 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 02a946154..45e3f84b1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -5,39 +5,20 @@ import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.lang.StringUtils; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.Serializable; -import java.util.HashSet; import java.util.Objects; -import java.util.Set; /** * Factory class for OpenAIRE identifiers in the Graph */ public class IdentifierFactory implements Serializable { - private static final Logger log = LoggerFactory.getLogger(IdentifierFactory.class); - public static final String ID_SEPARATOR = "::"; public static final String ID_PREFIX_SEPARATOR = "|"; public final static String ID_REGEX = "^[0-9][0-9]\\"+ID_PREFIX_SEPARATOR+".{12}"+ID_SEPARATOR+"[a-zA-Z0-9]{32}$"; public static final int ID_PREFIX_LEN = 12; - public static Set acceptedPidTypes = new HashSet<>(); - - static { - acceptedPidTypes.add("doi"); - acceptedPidTypes.add("doi"); - acceptedPidTypes.add("doi"); - acceptedPidTypes.add("doi"); - acceptedPidTypes.add("doi"); - acceptedPidTypes.add("doi"); - - } - public static String createIdentifier(T entity) { if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { @@ -48,14 +29,14 @@ public class IdentifierFactory implements Serializable { .getPid() .stream() .filter(s -> Objects.nonNull(s.getQualifier())) - .filter(s -> acceptedPidTypes.contains(s.getQualifier().getClassid())) - .max(new PidComparator(entity)) + .filter(s -> PidType.isValid(s.getQualifier().getClassid())) + .min(new PidComparator<>(entity)) .map(s -> idFromPid(entity, s)) .map(IdentifierFactory::verifyIdSyntax) .orElseGet(entity::getId); } - protected static String verifyIdSyntax(String s) { + private static String verifyIdSyntax(String s) { if(StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { throw new RuntimeException(String.format("malformed id: '%s'", s)); } else { @@ -74,7 +55,7 @@ public class IdentifierFactory implements Serializable { } private static String normalizePidValue(String value) { - //TODO more aggressive cleaning? keep only alphanum and punctation? + //TODO more aggressive cleaning? keep only alphanum and punctuation? return value.toLowerCase().replaceAll(" ", ""); } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java index 97bdd9c77..d0a8f87ce 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java @@ -27,8 +27,8 @@ public class PidComparator implements Comparator implements Comparator