From 2753044d13da0465b8b9061e70252ed6ac69a325 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 11 Jan 2024 16:28:26 +0100 Subject: [PATCH] refined mapping for the extraction of the original resource type --- .../dhp/oa/graph/raw/OafToOafMapper.java | 30 +++++++++++++++++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 19 ++++++++---- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index a63296d18..eee518353 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -11,6 +11,7 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Element; @@ -27,6 +28,15 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; public class OafToOafMapper extends AbstractMdRecordToOafMapper { + private static Set DC_TYPE_PUBLICATION_VERSION = new HashSet<>(); + + static { + DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/submittedVersion"); + DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/acceptedVersion"); + DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/publishedVersion"); + DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/updatedVersion"); + } + public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, final boolean forceOrginalId) { super(vocs, invisible, shouldHashId, forceOrginalId); @@ -192,24 +202,40 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { /** * The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value * So this 
method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the - * openaire's info:eu-repo type, and as last resort picks the 1st type text available + * openaire's info:eu-repo type, but excluding the following values (NOTE review: draft is documented here but is absent from DC_TYPE_PUBLICATION_VERSION, so the filter below keeps it; confirm which is intended) + * + * info:eu-repo/semantics/draft + * info:eu-repo/semantics/submittedVersion + * info:eu-repo/semantics/acceptedVersion + * info:eu-repo/semantics/publishedVersion + * info:eu-repo/semantics/updatedVersion + * + * Then, it picks the 1st dc:type text available and, in case there is no dc:type element, as last resort it tries + * to extract the type from the dr:CobjCategory element + * + * Examples: * * http://purl.org/coar/resource_type/c_5794 * info:eu-repo/semantics/article * Conference article + * 0006 * * @param doc the input document * @return the chosen resource type */ @Override protected String findOriginalType(Document doc) { - return (String) doc + final String dcType = (String) doc .selectNodes("//dc:type") .stream() .map(o -> "" + ((Node) o).getText().trim()) + .filter(t -> !DC_TYPE_PUBLICATION_VERSION.contains(t)) .sorted(new OriginalTypeComparator()) .findFirst() .orElse(null); + + final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()"); + return ObjectUtils.firstNonNull(dcType, drCobjCategory); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index e63b01a00..08529125c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -221,27 +221,36 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { } /** - * The Datacite element + * Extracts the resource type from the Datacite resourceType element * - * journal article + * journal article * * @param doc the input document *
@return the chosen resource type */ @Override protected String findOriginalType(Document doc) { - return Optional + final String resourceType = Optional .ofNullable( (Element) doc .selectSingleNode( "//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']")) .map(element -> { - final String resourceTypeURI = element.attributeValue("anyURI"); + final String resourceTypeURI = element.attributeValue("uri"); + final String resourceTypeAnyURI = element.attributeValue("anyURI"); final String resourceTypeTxt = element.getText(); + final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral"); - return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt); + return ObjectUtils + .firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral); }) .orElse(null); + + final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()"); + return ObjectUtils.firstNonNull(resourceType, drCobjCategory); } @Override