From 554551682d0501fa7664fab0a9bee8be6b0309d3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 11 Oct 2023 16:09:19 +0200 Subject: [PATCH] [raw graph] adopting the new COAR based vocabularies for the resource typing --- .../common/vocabulary/VocabularyGroup.java | 21 ++++++++++++ .../dhp/schema/oaf/utils/OafMapperUtils.java | 10 +++++- .../raw/AbstractMdRecordToOafMapper.java | 31 ++++++++++++++++++ .../dhp/oa/graph/raw/OafToOafMapper.java | 32 ++++++++++++++++--- .../dhp/oa/graph/raw/OdfToOafMapper.java | 27 ++++++++++++++++ .../oa/graph/raw/OriginalTypeComparator.java | 32 +++++++++++++++++++ .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 21 ++++++++++++ .../dnetlib/dhp/oa/graph/clean/synonyms.txt | 7 +++- .../eu/dnetlib/dhp/oa/graph/clean/terms.txt | 4 ++- .../dnetlib/dhp/oa/graph/raw/oaf_record.xml | 4 ++- pom.xml | 2 +- 11 files changed, 182 insertions(+), 9 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OriginalTypeComparator.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index fc7175270..4c1feac45 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -135,6 +135,27 @@ public class VocabularyGroup implements Serializable { return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn); } + public Qualifier lookupTermBySynonym(final String vocId, final String syn) { + if (StringUtils.isBlank(vocId)) { + return OafMapperUtils.unknown("", ""); + } + + final Vocabulary vocabulary = vocs.get(vocId.toLowerCase()); + + return Optional + .ofNullable(vocabulary.getTerm(syn)) + .map( + term -> OafMapperUtils + .qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName())) + .orElse( + Optional + .ofNullable(vocabulary.getTermBySynonym(syn)) + .map( + term -> OafMapperUtils + .qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName())) + .orElse(null)); + } + /** * getSynonymAsQualifierCaseSensitive * diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java index c58096d35..1c557c805 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -14,7 +14,6 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.schema.common.AccessRightComparator; -import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -141,6 +140,15 @@ public class OafMapperUtils { .collect(Collectors.toList()); } + public static InstanceTypeMapping instanceTypeMapping(String originalType, Qualifier term) { + final InstanceTypeMapping m = new InstanceTypeMapping(); + m.setVocabularyName(term.getSchemeid()); + m.setOriginalType(originalType); + m.setTypeCode(term.getClassid()); + m.setTypeLabel(term.getClassname()); + return m; + } + public static Qualifier unknown(final String schemeid, final String schemename) { return qualifier(UNKNOWN, "Unknown", schemeid, schemename); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index b37e6a755..2fed7d627 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -41,6 +41,11 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; + + protected static final String OPENAIRE_COAR_RESOURCE_TYPES_3_1 = "openaire::coar_resource_types_3_1"; + + public static final String OPENAIRE_USER_RESOURCE_TYPES = "openaire::user_resource_types"; + protected static final Qualifier ORCID_PID_TYPE = qualifier( ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, @@ -516,6 +521,32 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + protected abstract String findOriginalType(Document doc); + + protected List prepareInstanceTypeMapping(Document doc) { + return Optional.ofNullable(findOriginalType(doc)) + .map(originalType -> { + final List mappings = Lists.newArrayList(); + + if (vocs.vocabularyExists(OPENAIRE_COAR_RESOURCE_TYPES_3_1)) { + + // TODO verify what the vocabs return when a synonym is not defined + Qualifier coarTerm = vocs.lookupTermBySynonym(OPENAIRE_COAR_RESOURCE_TYPES_3_1, originalType); + mappings.add(OafMapperUtils.instanceTypeMapping(originalType, coarTerm)); + + if (vocs.vocabularyExists(OPENAIRE_USER_RESOURCE_TYPES)) { + + // TODO verify what the vocabs return when a synonym is not defined + Qualifier userTerm = vocs.lookupTermBySynonym(OPENAIRE_USER_RESOURCE_TYPES, coarTerm.getClassid()); + mappings.add(OafMapperUtils.instanceTypeMapping(originalType, userTerm)); + } + } + + return mappings; + }) + .orElse(new ArrayList<>()); + } + private Journal prepareJournal(final Document doc, final DataInfo info) { final Node n = doc.selectSingleNode("//oaf:journal"); if (n != null) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index a9f9367af..5cdb434f1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -5,12 +5,10 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import java.net.URLDecoder; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Element; @@ -25,6 +23,8 @@ import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; +import static org.apache.commons.lang3.StringUtils.contains; + public class OafToOafMapper extends AbstractMdRecordToOafMapper { public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, @@ -139,6 +139,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List alternateIdentifier = prepareResultPids(doc, info); final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); + instance.setInstanceTypeMapping(prepareInstanceTypeMapping(doc)); + final Set pids = new HashSet<>(pid); instance @@ -187,6 +189,28 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { return Lists.newArrayList(instance); } + /** + * The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value + * So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the + * openaire's info:eu-repo type, and as last resort picks the 1st type text available + * + * http://purl.org/coar/resource_type/c_5794 + * info:eu-repo/semantics/article + * Conference article + * + * @param doc the input document + * @return the chosen resource type + */ + @Override + protected String findOriginalType(Document doc) { + return (String) doc.selectNodes("//dc:type") + .stream() + .map(o -> "" + ((Node) o).getText().trim()) + .sorted(new OriginalTypeComparator()) + .findFirst() + .orElse(null); + } + @Override protected List> prepareSources(final Document doc, final DataInfo info) { return prepareListFields(doc, "//dc:source", info); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index bbd1e7ab1..c01775327 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -9,6 +9,7 @@ import java.net.URLDecoder; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Element; @@ -139,6 +140,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final List alternateIdentifier = prepareResultPids(doc, info); final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); + instance.setInstanceTypeMapping(prepareInstanceTypeMapping(doc)); + final Set pids = new HashSet<>(pid); instance @@ -217,6 +220,30 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { } } + @Override + protected List prepareInstanceTypeMapping(Document doc) { + return null; + } + + /** + * The Datacite element + * + * journal article + * + * @param doc the input document + * @return the chosen resource type + */ + @Override + protected String findOriginalType(Document doc) { + final Element resourceType = (Element) doc.selectSingleNode( + "//metadata/*[local-name() = 'resource']/*[local-name() = 'resourceType']"); + + final String resourceTypeURI = resourceType.attributeValue("anyURI"); + final String resourceTypeTxt = resourceType.getText(); + + return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt); + } + @Override protected List> prepareSources(final Document doc, final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OriginalTypeComparator.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OriginalTypeComparator.java new file mode 100644 index 000000000..2eeead32e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OriginalTypeComparator.java @@ -0,0 +1,32 @@ +package eu.dnetlib.dhp.oa.graph.raw; + +import java.util.Comparator; + +import static org.apache.commons.lang3.StringUtils.contains; +import static org.apache.commons.lang3.StringUtils.startsWith; + +public class OriginalTypeComparator implements Comparator { + + @Override + public int compare(String t1, String t2) { + + if (t1.equals(t2)) { + return 0; + } + if (startsWith(t1, "http") && contains(t1, "coar") && contains(t1, "resource_type")) { + return -1; + } + if (startsWith(t2, "http") && contains(t2, "coar") && contains(t2, "resource_type")) { + return 1; + } + if (startsWith(t1, "info:eu-repo/semantics")) { + return -1; + } + if (startsWith(t2, "info:eu-repo/semantics")) { + return 1; + } + + return t1.compareTo(t2); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b506d3a62..5df2b7a3b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -116,6 +116,27 @@ class MappersTest { assertNotNull(instance.getPid()); assertTrue(instance.getPid().isEmpty()); + assertNotNull(instance.getInstanceTypeMapping()); + assertEquals(2, instance.getInstanceTypeMapping().size()); + + Optional coarType = instance.getInstanceTypeMapping() + .stream() + .filter(itm -> AbstractMdRecordToOafMapper.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName())) + .findFirst(); + + assertTrue(coarType.isPresent()); + assertEquals("http://purl.org/coar/resource_type/c_5794", coarType.get().getTypeCode()); + assertEquals("conference paper", coarType.get().getTypeLabel()); + + Optional userType = instance.getInstanceTypeMapping() + .stream() + .filter(itm -> AbstractMdRecordToOafMapper.OPENAIRE_USER_RESOURCE_TYPES.equals(itm.getVocabularyName())) + .findFirst(); + + assertTrue(userType.isPresent()); + assertEquals("Article", userType.get().getTypeCode()); + assertEquals("Article", userType.get().getTypeLabel()); + assertFalse(instance.getAlternateIdentifier().isEmpty()); assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid()); assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt index 409dfd5dc..b3fa94b20 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt @@ -1245,4 +1245,9 @@ dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo dnet:relation_subRelType @=@ relationship @=@ publicationDataset dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned FOS @=@ 0101 mathematics @=@ FOS: Mathematics -FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences \ No newline at end of file +FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences +openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Proceedings paper +openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Conference article +openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ http://purl.org/eprint/type/ConferencePaper +openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Conference article +openaire::user_resource_types @=@ Article @=@ http://purl.org/coar/resource_type/c_5794 \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index 24ce42fc2..a38c0e987 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -1121,4 +1121,6 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplem dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences -FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences \ No newline at end of file +FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences +openaire::coar_resource_types_3_1 @=@ openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ conference paper +openaire::user_resource_types @=@ openaire::user_resource_types @=@ Article @=@ Article \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index 277578185..492fc9a7a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -47,7 +47,9 @@ provisioning services regulating services supporting services - Research Article + conference paper + http://purl.org/coar/resource_type/c_5794 + info:eu-repo/semantics/article 0001 2017-01-01 diff --git a/pom.xml b/pom.xml index 9cd82a343..0f1ec78af 100644 --- a/pom.xml +++ b/pom.xml @@ -888,7 +888,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [3.17.1] + [4.17.2-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6]