From 6ce340bd3d8ba3552a1640201d7003afd66c8141 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Tue, 6 Oct 2020 15:44:53 +0200
Subject: [PATCH] WIP stable ids: IdentifierFactory

---
 dhp-common/pom.xml                            |   6 +
 .../schema/oaf/utils/IdentifierFactory.java   | 119 ++++++++++++++++++
 .../dhp/schema/oaf/utils/PidComparator.java   |  98 +++++++++++++++
 3 files changed, 223 insertions(+)
 create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
 create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java

diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 1dc3208b5..6e7ee527b 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -29,6 +29,12 @@
 			<artifactId>spark-sql_2.11</artifactId>
 		</dependency>
 
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-schemas</artifactId>
+			<version>${project.version}</version>
+		</dependency>
+
 		<dependency>
 			<groupId>commons-cli</groupId>
 			<artifactId>commons-cli</artifactId>
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
new file mode 100644
index 000000000..02a946154
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java
@@ -0,0 +1,119 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import org.apache.commons.lang.StringUtils;
+import org.jetbrains.annotations.NotNull;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * Factory class for OpenAIRE identifiers in the Graph.
+ * <p>
+ * A valid identifier matches {@link #ID_REGEX}: two digits, the prefix separator, a 12 character prefix,
+ * the id separator and 32 alphanumeric characters.
+ */
+public class IdentifierFactory implements Serializable {
+
+	private static final Logger log = LoggerFactory.getLogger(IdentifierFactory.class);
+
+	public static final String ID_SEPARATOR = "::";
+	public static final String ID_PREFIX_SEPARATOR = "|";
+	public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR + "[a-zA-Z0-9]{32}$";
+	public static final int ID_PREFIX_LEN = 12;
+
+	/** PID types (qualifier classid) that may be used to derive an identifier. */
+	public static Set<String> acceptedPidTypes = new HashSet<>();
+
+	static {
+		// NOTE(review): only "doi" is accepted for now, while PidComparator also ranks other PID types
+		// (pmid, pmc, GRID, mag_id, urn) - confirm which of them should be accepted here.
+		acceptedPidTypes.add("doi");
+	}
+
+	/**
+	 * Creates the identifier for the given entity.
+	 * <p>
+	 * When the entity has no PIDs, or none with a qualifier of an accepted type, the current entity id is returned.
+	 * Otherwise the identifier is derived from the PID that ranks first according to {@link PidComparator}.
+	 *
+	 * @param entity the entity to create the identifier for
+	 * @param <T> the entity type
+	 * @return the PID based identifier, or {@code entity.getId()} when no accepted PID is available
+	 * @throws RuntimeException when the PID based identifier does not match {@link #ID_REGEX}
+	 */
+	public static <T extends OafEntity> String createIdentifier(T entity) {
+
+		if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
+			return entity.getId();
+		}
+
+		return entity
+			.getPid()
+			.stream()
+			.filter(s -> Objects.nonNull(s.getQualifier()))
+			.filter(s -> acceptedPidTypes.contains(s.getQualifier().getClassid()))
+			// PidComparator orders the preferred PID types first (and nulls last), hence min() and not max()
+			.min(new PidComparator<>(entity))
+			.map(s -> idFromPid(entity, s))
+			.map(IdentifierFactory::verifyIdSyntax)
+			.orElseGet(entity::getId);
+	}
+
+	/**
+	 * Checks that the given identifier matches {@link #ID_REGEX}.
+	 *
+	 * @param s the identifier to check
+	 * @return the same identifier, when it is valid
+	 * @throws RuntimeException when the identifier is blank or does not match {@link #ID_REGEX}
+	 */
+	protected static String verifyIdSyntax(String s) {
+		if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
+			throw new RuntimeException(String.format("malformed id: '%s'", s));
+		} else {
+			return s;
+		}
+	}
+
+	/**
+	 * Builds the identifier from the part of the entity id that precedes {@link #ID_PREFIX_SEPARATOR},
+	 * the prefix created from the PID type and the {@code DHPUtils.md5} of the normalized PID value.
+	 */
+	private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
+		return new StringBuilder()
+			.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
+			.append(ID_PREFIX_SEPARATOR)
+			.append(createPrefix(s.getQualifier().getClassid()))
+			.append(ID_SEPARATOR)
+			.append(DHPUtils.md5(normalizePidValue(s.getValue())))
+			.toString();
+	}
+
+	/**
+	 * Lower-cases the PID value and removes the space characters from it.
+	 */
+	private static String normalizePidValue(String value) {
+		// TODO more aggressive cleaning? keep only alphanum and punctuation?
+		// Locale.ROOT keeps the result (and thus the md5) independent from the JVM default locale
+		return value.toLowerCase(Locale.ROOT).replace(" ", "");
+	}
+
+	/**
+	 * Creates the prefix (length = {@link #ID_PREFIX_LEN}): the PID type, truncated or right-padded with '_'.
+	 */
+	private static String createPrefix(String pidType) {
+		StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
+		while (prefix.length() < ID_PREFIX_LEN) {
+			prefix.append("_");
+		}
+		return prefix.substring(0, ID_PREFIX_LEN);
+	}
+
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java
new file mode 100644
index 000000000..97bdd9c77
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java
@@ -0,0 +1,98 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.util.Comparator;
+
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+/**
+ * Orders the PIDs of an entity by the preference of their type (qualifier classid): the preferred PID
+ * compares as the smallest one, a null PID as the largest one.
+ *
+ * @param <T> the type of the entity the PIDs belong to
+ */
+public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
+
+	private final T entity;
+
+	public PidComparator(T entity) {
+		this.entity = entity;
+	}
+
+	/**
+	 * Compares two PIDs by type. PIDs of the same type compare as equal; for a Result or an Organization
+	 * the type specific ranking applies, for any other entity the types are ordered lexicographically.
+	 *
+	 * @return a negative value when left is preferred, a positive value when right is preferred, 0 otherwise
+	 */
+	@Override
+	public int compare(StructuredProperty left, StructuredProperty right) {
+
+		if (left == null && right == null)
+			return 0;
+		if (left == null)
+			return 1;
+		if (right == null)
+			return -1;
+
+		String lClass = left.getQualifier().getClassid();
+		String rClass = right.getQualifier().getClassid();
+
+		if (lClass.equals(rClass))
+			return 0;
+
+		if (ModelSupport.isSubClass(entity, Result.class)) {
+			return compareResultPids(lClass, rClass);
+		}
+		if (ModelSupport.isSubClass(entity, Organization.class)) {
+			return compareOrganizationPids(lClass, rClass);
+		}
+
+		// Else (but unlikely), lexicographical ordering will do.
+		return lClass.compareTo(rClass);
+	}
+
+	/** Result PID types in order of preference: doi, pmid, pmc; any other pair of types compares as equal. */
+	private int compareResultPids(String lClass, String rClass) {
+		if (lClass.equals("doi"))
+			return -1;
+		if (rClass.equals("doi"))
+			return 1;
+
+		if (lClass.equals("pmid"))
+			return -1;
+		if (rClass.equals("pmid"))
+			return 1;
+
+		if (lClass.equals("pmc"))
+			return -1;
+		if (rClass.equals("pmc"))
+			return 1;
+
+		return 0;
+	}
+
+	/** Organization PID types in order of preference: GRID, mag_id, urn; any other pair compares as equal. */
+	private int compareOrganizationPids(String lClass, String rClass) {
+		if (lClass.equals("GRID"))
+			return -1;
+		if (rClass.equals("GRID"))
+			return 1;
+
+		if (lClass.equals("mag_id"))
+			return -1;
+		if (rClass.equals("mag_id"))
+			return 1;
+
+		if (lClass.equals("urn"))
+			return -1;
+		if (rClass.equals("urn"))
+			return 1;
+
+		return 0;
+	}
+}