From 640b88570694e41415a3006bac5ccbd6eb0f9868 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 16 Mar 2021 14:19:32 +0100 Subject: [PATCH 1/2] added instance.alternativeIdentifiers to the graph model, adjusted the mapping applied to the contents from the aggregator --- .../schema/oaf/utils/IdentifierFactory.java | 74 +++++++++------- .../dhp/schema/common/ModelConstants.java | 2 +- .../eu/dnetlib/dhp/schema/oaf/Instance.java | 11 +++ .../dhp/schema/oaf/StructuredProperty.java | 14 ++- .../raw/AbstractMdRecordToOafMapper.java | 4 +- .../dhp/oa/graph/raw/OafToOafMapper.java | 15 +++- .../dhp/oa/graph/raw/OdfToOafMapper.java | 14 ++- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 87 +++++++++++++++++-- .../dhp/oa/graph/raw/oaf_record_pubmed.xml | 64 ++++++++++++++ 9 files changed, 242 insertions(+), 43 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record_pubmed.xml diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 40279d29d..a24e40c5f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -10,8 +10,10 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; +import org.jetbrains.annotations.NotNull; import com.google.common.collect.HashBiMap; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.*; @@ -54,6 +56,10 @@ public class IdentifierFactory implements Serializable { PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive"); } + public static List getPids(List pid, KeyValue collectedFrom) { + return pidFromInstance(pid, collectedFrom).collect(Collectors.toList()); + } + /** * Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given * entity T. Returns entity.id when none of the PIDs meet the selection criteria is available. @@ -91,37 +97,7 @@ public class IdentifierFactory implements Serializable { return Optional .ofNullable(((Result) entity).getInstance()) .map( - instance -> instance - .stream() - .map( - i -> Optional - .ofNullable(i.getPid()) - .map( - pp -> pp - .stream() - // filter away PIDs provided by a DS that is not considered an authority for the - // given PID Type - .filter(p -> { - final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); - return Optional.ofNullable(i.getCollectedfrom()).isPresent() && - Optional - .ofNullable(PID_AUTHORITY.get(pType)) - .map(authorities -> { - final KeyValue cf = i.getCollectedfrom(); - return authorities.containsKey(cf.getKey()) - || authorities.containsValue(cf.getValue()); - }) - .orElse(false); - }) - .map(CleaningFunctions::normalizePidValue) - .filter(IdentifierFactory::pidFilter)) - .orElse(Stream.empty())) - .flatMap(Function.identity()) - .collect( - Collectors - .groupingBy( - p -> p.getQualifier().getClassid(), - Collectors.mapping(p -> p, Collectors.toList())))) + instance -> mapPids(instance)) .orElse(new HashMap<>()); } else { return entity @@ -137,6 +113,42 @@ public class IdentifierFactory implements Serializable { } } + private static Map> mapPids(List instance) { + return instance + .stream() + .map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom())) + .flatMap(Function.identity()) + .collect( + Collectors + .groupingBy( + p -> p.getQualifier().getClassid(), + Collectors.mapping(p -> p, Collectors.toList()))); + } + + private static Stream pidFromInstance(List pid, KeyValue collectedFrom) { + return Optional + .ofNullable(pid) + .map( + pp -> pp + .stream() + // filter away PIDs provided by a DS that is not considered an authority for the + // given PID Type + .filter(p -> { + final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); + return Optional.ofNullable(collectedFrom).isPresent() && + Optional + .ofNullable(PID_AUTHORITY.get(pType)) + .map(authorities -> { + return authorities.containsKey(collectedFrom.getKey()) + || authorities.containsValue(collectedFrom.getValue()); + }) + .orElse(false); + }) + .map(CleaningFunctions::normalizePidValue) + .filter(IdentifierFactory::pidFilter)) + .orElse(Stream.empty()); + } + /** * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} */ diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index 1a0117fb9..6c4de6342 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -18,7 +18,7 @@ public class ModelConstants { public static final String PUBMED_CENTRAL_ID = "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357"; public static final String ARXIV_ID = "10|opendoar____::6f4922f45568161a8cdf4ad2299f6d23"; - //VOCABULARY VALUE + // VOCABULARY VALUE public static final String ACCESS_RIGHT_OPEN = "OPEN"; public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies"; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index aae5695df..c4cde0c2a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -23,6 +23,8 @@ public class Instance implements Serializable { private List pid; + private List alternateIdentifier; + private Field dateofacceptance; // ( article | book ) processing charges. Defined here to cope with possible wrongly typed @@ -107,6 +109,14 @@ public class Instance implements Serializable { this.dateofacceptance = dateofacceptance; } + public List getAlternateIdentifier() { + return alternateIdentifier; + } + + public void setAlternateIdentifier(List alternateIdentifier) { + this.alternateIdentifier = alternateIdentifier; + } + public Field getProcessingchargeamount() { return processingchargeamount; } @@ -159,4 +169,5 @@ public class Instance implements Serializable { return toComparableString().equals(other.toComparableString()); } + } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java index 1fa0de0be..024b915e4 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java @@ -2,6 +2,13 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Joiner; public class StructuredProperty implements Serializable { @@ -36,7 +43,12 @@ public class StructuredProperty implements Serializable { } public String toComparableString() { - return value != null ? value.toLowerCase() : ""; + return Stream + .of( + getQualifier().toComparableString(), + Optional.ofNullable(getValue()).map(String::toLowerCase).orElse("")) + .filter(StringUtils::isNotBlank) + .collect(Collectors.joining("||")); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index d56971220..bf07de116 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -284,8 +284,8 @@ public abstract class AbstractMdRecordToOafMapper { r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setPid(prepareResultPids(doc, info)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation")); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setOaiprovenance(prepareOAIprovenance(doc)); r.setAuthor(prepareAuthors(doc, info)); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index ccb3d6caf..22bd6718a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -5,7 +5,9 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -19,6 +21,7 @@ import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -125,7 +128,17 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); - instance.setPid(prepareResultPids(doc, info)); + + final List alternateIdentifier = prepareResultPids(doc, info); + final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); + + final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); + + instance + .setAlternateIdentifier( + alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList())); + instance.setPid(pid); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 9d0d2368a..d4997cd2b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; import java.util.*; +import java.util.function.Function; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -14,6 +15,7 @@ import org.dom4j.Node; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @@ -102,7 +104,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); - instance.setPid(prepareResultPids(doc, info)); + + final List alternateIdentifier = prepareResultPids(doc, info); + final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); + + final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); + + instance + .setAlternateIdentifier( + alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList())); + instance.setPid(pid); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index fb79610c8..b649f8026 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -123,9 +123,11 @@ public class MappersTest { }); assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); assertNotNull(p.getInstance().get(0).getPid()); - assertTrue(p.getInstance().get(0).getPid().size() == 1); - assertEquals("doi", p.getInstance().get(0).getPid().get(0).getQualifier().getClassid()); - assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getPid().get(0).getValue()); + assertTrue(p.getInstance().get(0).getPid().isEmpty()); + + assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty()); + assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); + assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); @@ -154,6 +156,78 @@ public class MappersTest { // System.out.println(new ObjectMapper().writeValueAsString(r2)); } + @Test + void testPublication_PubMed() throws IOException { + + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record_pubmed.xml")); + + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); + + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Publication); + + final Publication p = (Publication) list.get(0); + + assertValidId(p.getId()); + + assertEquals(2, p.getOriginalId().size()); + assertTrue(p.getOriginalId().contains("oai:pubmedcentral.nih.gov:1517292")); + + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertFalse(p.getDataInfo().getInvisible()); + assertTrue(StringUtils.isNotBlank(p.getDateofcollection())); + assertTrue(StringUtils.isNotBlank(p.getDateoftransformation())); + + assertTrue(p.getAuthor().size() > 0); + final Optional author = p + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + + final StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-6651-1178", pid.getValue()); + assertEquals("ORCID", pid.getQualifier().getClassid()); + assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Votsi,Nefta", author.get().getFullname()); + assertEquals("Votsi", author.get().getSurname()); + assertEquals("Nefta", author.get().getName()); + + assertTrue(p.getSubject().size() > 0); + assertTrue(p.getPid().size() > 0); + assertEquals(p.getPid().get(0).getValue(), "PMC1517292"); + assertEquals(p.getPid().get(0).getQualifier().getClassid(), "pmc"); + + assertNotNull(p.getInstance()); + assertTrue(p.getInstance().size() > 0); + p + .getInstance() + .stream() + .forEach(i -> { + assertNotNull(i.getAccessright()); + assertEquals("OPEN", i.getAccessright().getClassid()); + }); + assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); + assertNotNull(p.getInstance().get(0).getPid()); + assertTrue(p.getInstance().get(0).getPid().size() == 2); + + assertTrue(p.getInstance().get(0).getAlternateIdentifier().size() == 1); + assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); + assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); + + assertNotNull(p.getBestaccessright()); + assertEquals("OPEN", p.getBestaccessright().getClassid()); + } + @Test void testPublicationInvisible() throws IOException { @@ -239,9 +313,10 @@ public class MappersTest { }); assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertNotNull(d.getInstance().get(0).getPid()); - assertTrue(d.getInstance().get(0).getPid().size() == 1); - assertEquals("doi", d.getInstance().get(0).getPid().get(0).getQualifier().getClassid()); - assertEquals("10.5281/zenodo.3234526", d.getInstance().get(0).getPid().get(0).getValue()); + assertTrue(d.getInstance().get(0).getPid().isEmpty()); + + assertEquals("doi", d.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); + assertEquals("10.5281/zenodo.3234526", d.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); assertValidId(r1.getSource()); assertValidId(r1.getTarget()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record_pubmed.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record_pubmed.xml new file mode 100644 index 000000000..241bfa4ae --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record_pubmed.xml @@ -0,0 +1,64 @@ + + +
+ od_______267::0000072375bc0e68fa09d4e6b7658248 + oai:pubmedcentral.nih.gov:1517292 + + + + + + 2020-08-03T18:38:58Z + 2020-08-03T19:38:58Z + od_______267 +
+ + DEATHS + Nikolaidou,Charitini + Votsi,Nefta + Sgardelis,Steanos + Halley,John + Pantis,John + Tsiafouli,Maria + 1922-07 + + https://europepmc.org/articles/PMC1517292/ + eng + Articles + Text + 0038 + + 1922-07-01 + opendoar____::267 + OPEN + + + PMC1517292 + 18738762 + 10.3897/oneeco.2.e13718 + + + + + https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi + oai:pubmedcentral.nih.gov:1517292 + 2006-08-14 + http://www.openarchives.org/OAI/2.0/oai_dc/ + + + + false + false + 0.9 + + + + +
\ No newline at end of file From 3b2da86f0a6207af58779b7a2619e670b22681ae Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 16 Mar 2021 17:05:38 +0100 Subject: [PATCH 2/2] added precondition on IdentifierFactory to check the presence of entity.id --- .../eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index a24e40c5f..6eb03f9e9 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import static com.google.common.base.Preconditions.checkArgument; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.io.Serializable; @@ -10,10 +11,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import com.google.common.collect.HashBiMap; -import com.google.common.collect.Lists; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.*; @@ -71,6 +70,8 @@ public class IdentifierFactory implements Serializable { */ public static String createIdentifier(T entity, boolean md5) { + checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier"); + final Map> pids = extractPids(entity); return pids