From c016cc050ac45729d41573222a8ee4b700b95086 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 23 Nov 2020 19:16:40 +0100 Subject: [PATCH] IdentifierFactory: in case a record provides more than one pid of the same type, the lexicographically lower value is chosen as best pick --- .../schema/oaf/utils/IdentifierFactory.java | 35 ++++++++++---- .../oaf/utils/OrganizationPidComparator.java | 22 +++++---- .../dhp/schema/oaf/utils/PidComparator.java | 20 +++----- .../schema/oaf/utils/PidValueComparator.java | 35 ++++++++++++++ .../schema/oaf/utils/ResultPidComparator.java | 46 +++++++++++-------- .../oaf/utils/IdentifierFactoryTest.java | 7 +-- ...ication_doi.json => publication_doi1.json} | 0 .../schema/oaf/utils/publication_doi2.json | 1 + ...ication_pmc.json => publication_pmc1.json} | 0 ...ication_urn.json => publication_urn1.json} | 0 10 files changed, 114 insertions(+), 52 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java rename dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/{publication_doi.json => publication_doi1.json} (100%) create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json rename dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/{publication_pmc.json => publication_pmc1.json} (100%) rename dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/{publication_urn.json => publication_urn1.json} (100%) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 28e4accca..a7310e8de 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -2,7 +2,12 @@ package eu.dnetlib.dhp.schema.oaf.utils; import java.io.Serializable; +import java.util.List; +import 
java.util.Map; import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.lang.StringUtils; @@ -42,14 +47,28 @@ public class IdentifierFactory implements Serializable { return entity.getId(); } - return entity - .getPid() - .stream() - .filter(s -> pidFilter(s)) - .min(new PidComparator<>(entity)) - .map(s -> idFromPid(entity, s)) - .map(IdentifierFactory::verifyIdSyntax) - .orElseGet(entity::getId); + Map> pids = entity + .getPid() + .stream() + .filter(s -> pidFilter(s)) + .collect( + Collectors.groupingBy(p -> p.getQualifier().getClassid(), + Collectors.mapping(p -> p, Collectors.toList())) + ); + + return pids + .values() + .stream() + .flatMap(s -> s.stream()) + .min(new PidComparator<>(entity)) + .map(min -> Optional.ofNullable(pids.get(min.getQualifier().getClassid())) + .map(p -> p.stream() + .sorted(new PidValueComparator()) + .findFirst() + .map(s -> idFromPid(entity, s)) + .orElseGet(entity::getId)) + .orElseGet(entity::getId)) + .orElseGet(entity::getId); } protected static boolean pidFilter(StructuredProperty s) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java index 733d41bff..a5e1b34d7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java @@ -1,25 +1,31 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + import java.util.Comparator; -public class OrganizationPidComparator implements Comparator { +public class OrganizationPidComparator implements Comparator { @Override - public int compare(PidType pLeft, PidType pRight) { - if (pLeft.equals(PidType.GRID)) + public int compare(StructuredProperty left, StructuredProperty right) { + + 
PidType lClass = PidType.valueOf(left.getQualifier().getClassid()); + PidType rClass = PidType.valueOf(right.getQualifier().getClassid()); + + if (lClass.equals(PidType.GRID)) return -1; - if (pRight.equals(PidType.GRID)) + if (rClass.equals(PidType.GRID)) return 1; - if (pLeft.equals(PidType.mag_id)) + if (lClass.equals(PidType.mag_id)) return -1; - if (pRight.equals(PidType.mag_id)) + if (rClass.equals(PidType.mag_id)) return 1; - if (pLeft.equals(PidType.urn)) + if (lClass.equals(PidType.urn)) return -1; - if (pRight.equals(PidType.urn)) + if (rClass.equals(PidType.urn)) return 1; return 0; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java index 34ce5563f..2bee0eb56 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java @@ -27,28 +27,22 @@ public class PidComparator implements Comparator { + + @Override + public int compare(StructuredProperty left, StructuredProperty right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + StructuredProperty l = CleaningFunctions.normalizePidValue(left); + StructuredProperty r = CleaningFunctions.normalizePidValue(right); + + return Optional.ofNullable(l.getValue()) + .map(lv -> Optional.ofNullable(r.getValue()) + .map(rv -> lv.compareTo(rv)) + .orElse(-1)) + .orElse(1); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java index 0f65cca36..0a733495d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java @@ -1,55 +1,61 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import 
eu.dnetlib.dhp.schema.oaf.StructuredProperty; + import java.util.Comparator; -public class ResultPidComparator implements Comparator { +public class ResultPidComparator implements Comparator { @Override - public int compare(PidType pLeft, PidType pRight) { - if (pLeft.equals(PidType.doi)) + public int compare(StructuredProperty left, StructuredProperty right) { + + PidType lClass = PidType.valueOf(left.getQualifier().getClassid()); + PidType rClass = PidType.valueOf(right.getQualifier().getClassid()); + + if (lClass.equals(PidType.doi)) return -1; - if (pRight.equals(PidType.doi)) + if (rClass.equals(PidType.doi)) return 1; - if (pLeft.equals(PidType.pmid)) + if (lClass.equals(PidType.pmid)) return -1; - if (pRight.equals(PidType.pmid)) + if (rClass.equals(PidType.pmid)) return 1; - if (pLeft.equals(PidType.pmc)) + if (lClass.equals(PidType.pmc)) return -1; - if (pRight.equals(PidType.pmc)) + if (rClass.equals(PidType.pmc)) return 1; - if (pLeft.equals(PidType.handle)) + if (lClass.equals(PidType.handle)) return -1; - if (pRight.equals(PidType.handle)) + if (rClass.equals(PidType.handle)) return 1; - if (pLeft.equals(PidType.arXiv)) + if (lClass.equals(PidType.arXiv)) return -1; - if (pRight.equals(PidType.arXiv)) + if (rClass.equals(PidType.arXiv)) return 1; - if (pLeft.equals(PidType.NCID)) + if (lClass.equals(PidType.NCID)) return -1; - if (pRight.equals(PidType.NCID)) + if (rClass.equals(PidType.NCID)) return 1; - if (pLeft.equals(PidType.GBIF)) + if (lClass.equals(PidType.GBIF)) return -1; - if (pRight.equals(PidType.GBIF)) + if (rClass.equals(PidType.GBIF)) return 1; - if (pLeft.equals(PidType.nct)) + if (lClass.equals(PidType.nct)) return -1; - if (pRight.equals(PidType.nct)) + if (rClass.equals(PidType.nct)) return 1; - if (pLeft.equals(PidType.urn)) + if (lClass.equals(PidType.urn)) return -1; - if (pRight.equals(PidType.urn)) + if (rClass.equals(PidType.urn)) return 1; return 0; diff --git 
a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java index d458c613e..2b34a46ca 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java @@ -22,10 +22,11 @@ public class IdentifierFactoryTest { @Test public void testCreateIdentifierForPublication() throws IOException { - verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013")); - verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329")); + verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013")); + verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013")); + verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329")); verifyIdentifier( - "publication_urn.json", + "publication_urn1.json", "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2")); final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi1.json similarity index 100% rename from dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi.json rename to dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi1.json diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json new file mode 100644 index 000000000..7cc27f440 --- /dev/null +++ 
b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2010.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc1.json similarity index 100% rename from dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc.json rename to dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc1.json diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn1.json similarity index 100% rename from dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn.json rename to dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn1.json