[raw graph] adopting the new COAR based vocabularies for the resource typing

This commit is contained in:
Claudio Atzori 2023-10-11 16:09:19 +02:00
parent a460ebe215
commit 554551682d
11 changed files with 182 additions and 9 deletions

View File

@ -135,6 +135,27 @@ public class VocabularyGroup implements Serializable {
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
}
/**
 * Resolves a value against a vocabulary, trying a direct term lookup first
 * ({@code Vocabulary.getTerm}) and falling back to a synonym lookup
 * ({@code Vocabulary.getTermBySynonym}).
 *
 * @param vocId the vocabulary identifier; it is lower-cased before being looked up
 * @param syn the term or synonym to resolve
 * @return the qualifier built from the matching term and its vocabulary; the "unknown"
 *         qualifier (with empty scheme) when {@code vocId} is blank; {@code null} when the
 *         vocabulary is not registered or no term matches
 */
public Qualifier lookupTermBySynonym(final String vocId, final String syn) {
    if (StringUtils.isBlank(vocId)) {
        return OafMapperUtils.unknown("", "");
    }

    final Vocabulary vocabulary = vocs.get(vocId.toLowerCase());
    if (vocabulary == null) {
        // Unregistered vocabulary: report "no match" instead of failing with a NullPointerException.
        return null;
    }

    final Qualifier direct = Optional
        .ofNullable(vocabulary.getTerm(syn))
        .map(
            term -> OafMapperUtils
                .qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName()))
        .orElse(null);
    if (direct != null) {
        return direct;
    }

    // The synonym lookup is only performed when the direct lookup found nothing.
    return Optional
        .ofNullable(vocabulary.getTermBySynonym(syn))
        .map(
            term -> OafMapperUtils
                .qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName()))
        .orElse(null);
}
/**
* getSynonymAsQualifierCaseSensitive
*

View File

@ -14,7 +14,6 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -141,6 +140,15 @@ public class OafMapperUtils {
.collect(Collectors.toList());
}
/**
 * Creates an {@link InstanceTypeMapping} that records how an original type string was mapped
 * onto a vocabulary term.
 *
 * @param originalType the type value as found in the source record
 * @param term the vocabulary term; its scheme id, class id and class name are copied into the mapping
 * @return the populated mapping
 */
public static InstanceTypeMapping instanceTypeMapping(String originalType, Qualifier term) {
    // Read everything from the term up front, then fill the mapping.
    final String vocabularyName = term.getSchemeid();
    final String typeCode = term.getClassid();
    final String typeLabel = term.getClassname();

    final InstanceTypeMapping mapping = new InstanceTypeMapping();
    mapping.setOriginalType(originalType);
    mapping.setVocabularyName(vocabularyName);
    mapping.setTypeCode(typeCode);
    mapping.setTypeLabel(typeLabel);
    return mapping;
}
/**
 * Builds the placeholder qualifier used when no proper term is available: it is created via
 * {@code qualifier(...)} with {@code UNKNOWN} and {@code "Unknown"} as the first two arguments.
 *
 * @param schemeid the scheme id to set on the qualifier
 * @param schemename the scheme name to set on the qualifier
 * @return the "unknown" qualifier for the given scheme
 */
public static Qualifier unknown(final String schemeid, final String schemename) {
    final Qualifier unknownTerm = qualifier(UNKNOWN, "Unknown", schemeid, schemename);
    return unknownTerm;
}

View File

@ -41,6 +41,11 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
// Identifier of the vocabulary used to resolve a record's original type to a COAR resource type term.
protected static final String OPENAIRE_COAR_RESOURCE_TYPES_3_1 = "openaire::coar_resource_types_3_1";
// Identifier of the vocabulary used to resolve a COAR term id to a user-facing resource type term.
public static final String OPENAIRE_USER_RESOURCE_TYPES = "openaire::user_resource_types";
protected static final Qualifier ORCID_PID_TYPE = qualifier(
ModelConstants.ORCID_PENDING,
ModelConstants.ORCID_CLASSNAME,
@ -516,6 +521,32 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
/**
 * Extracts from the record the single type value that the instance type mapping is based on.
 *
 * @param doc the input document
 * @return the original type, or {@code null} when none is available (callers wrap the result
 *         with {@code Optional.ofNullable})
 */
protected abstract String findOriginalType(Document doc);
/**
 * Maps the record's original type onto the resource type vocabularies.
 *
 * The original type is first resolved against the COAR resource types vocabulary; when a COAR
 * term is found, its class id is in turn resolved against the user resource types vocabulary.
 * Each successful resolution contributes one {@link InstanceTypeMapping}.
 *
 * @param doc the input document
 * @return the mappings found, possibly empty; never {@code null}
 */
protected List<InstanceTypeMapping> prepareInstanceTypeMapping(Document doc) {
    final List<InstanceTypeMapping> mappings = new ArrayList<>();

    final String originalType = findOriginalType(doc);
    if (originalType == null || !vocs.vocabularyExists(OPENAIRE_COAR_RESOURCE_TYPES_3_1)) {
        return mappings;
    }

    // The lookup yields null when the type is neither a term nor a synonym of the vocabulary:
    // in that case there is nothing to map, and no user type can be derived either.
    final Qualifier coarTerm = vocs.lookupTermBySynonym(OPENAIRE_COAR_RESOURCE_TYPES_3_1, originalType);
    if (coarTerm == null) {
        return mappings;
    }
    mappings.add(OafMapperUtils.instanceTypeMapping(originalType, coarTerm));

    if (vocs.vocabularyExists(OPENAIRE_USER_RESOURCE_TYPES)) {
        final Qualifier userTerm = vocs
            .lookupTermBySynonym(OPENAIRE_USER_RESOURCE_TYPES, coarTerm.getClassid());
        if (userTerm != null) {
            mappings.add(OafMapperUtils.instanceTypeMapping(originalType, userTerm));
        }
    }
    return mappings;
}
private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) {

View File

@ -5,12 +5,10 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;
@ -25,6 +23,8 @@ import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import static org.apache.commons.lang3.StringUtils.contains;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
@ -139,6 +139,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
instance.setInstanceTypeMapping(prepareInstanceTypeMapping(doc));
final Set<StructuredProperty> pids = new HashSet<>(pid);
instance
@ -187,6 +189,28 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
return Lists.newArrayList(instance);
}
/**
 * The Dublin Core element dc:type can be repeated, but the mapping must be based on a single value.
 * This method gives precedence to the COAR resource type, when available. Otherwise, it looks for the
 * OpenAIRE info:eu-repo type, and as a last resort picks the first type text in the comparator's order.
 * Blank dc:type elements are ignored.
 *
 * <pre>
 * {@code
 * <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
 * <dc:type>info:eu-repo/semantics/article</dc:type>
 * <dc:type>Conference article</dc:type>
 * }
 * </pre>
 *
 * @param doc the input document
 * @return the chosen resource type (trimmed), or {@code null} when the record has no non-blank dc:type
 */
@Override
protected String findOriginalType(Document doc) {
    final List<String> types = new ArrayList<>();
    for (final Object o : doc.selectNodes("//dc:type")) {
        final String text = ((Node) o).getText();
        // An empty element would otherwise sort before any free-text type and be selected.
        if (StringUtils.isNotBlank(text)) {
            types.add(text.trim());
        }
    }
    if (types.isEmpty()) {
        return null;
    }
    types.sort(new OriginalTypeComparator());
    return types.get(0);
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:source", info);

View File

@ -9,6 +9,7 @@ import java.net.URLDecoder;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;
@ -139,6 +140,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
instance.setInstanceTypeMapping(prepareInstanceTypeMapping(doc));
final Set<StructuredProperty> pids = new HashSet<>(pid);
instance
@ -217,6 +220,30 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
}
}
/**
 * Overrides the inherited implementation so that no instance type mapping is computed by this
 * mapper: the method ignores the document and always returns {@code null}.
 *
 * NOTE(review): callers pass this value straight to {@code instance.setInstanceTypeMapping(...)},
 * so the instance ends up with a null mapping list rather than an empty one; it also means
 * {@code findOriginalType} is not reached through this method. Confirm this is intended.
 *
 * @param doc the input document (unused)
 * @return always {@code null}
 */
@Override
protected List<InstanceTypeMapping> prepareInstanceTypeMapping(Document doc) {
return null;
}
/**
 * Reads the original type from the Datacite resourceType element, preferring the value of its
 * {@code anyURI} attribute over the element text, e.g.
 *
 * <pre>
 * {@code
 * <datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType>
 * }
 * </pre>
 *
 * @param doc the input document
 * @return the attribute value when it is not blank, otherwise the element text when it is not
 *         blank, otherwise {@code null} (also when the record has no resourceType element)
 */
@Override
protected String findOriginalType(Document doc) {
    final Element resourceType = (Element) doc
        .selectSingleNode(
            "//metadata/*[local-name() = 'resource']/*[local-name() = 'resourceType']");
    if (resourceType == null) {
        // Records without a resourceType element simply have no original type.
        return null;
    }

    final String resourceTypeURI = resourceType.attributeValue("anyURI");
    if (StringUtils.isNotBlank(resourceTypeURI)) {
        return resourceTypeURI;
    }

    final String resourceTypeTxt = resourceType.getText();
    return StringUtils.isNotBlank(resourceTypeTxt) ? resourceTypeTxt : null;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.oa.graph.raw;
import java.util.Comparator;
import static org.apache.commons.lang3.StringUtils.contains;
import static org.apache.commons.lang3.StringUtils.startsWith;
/**
 * Orders resource type strings by how authoritative they are, so that the first element of a
 * sorted sequence is the preferred one:
 * <ol>
 * <li>COAR resource type URIs (start with "http" and contain both "coar" and "resource_type");</li>
 * <li>values starting with "info:eu-repo/semantics";</li>
 * <li>anything else.</li>
 * </ol>
 * Values in the same group are ordered by their natural (lexicographic) order, which keeps the
 * comparison antisymmetric and transitive as required by {@link Comparator}. {@code null}
 * values belong to the last group and sort after every non-null value.
 */
public class OriginalTypeComparator implements Comparator<String> {

    private static final int RANK_COAR = 0;
    private static final int RANK_EU_REPO = 1;
    private static final int RANK_OTHER = 2;

    private static final Comparator<String> NATURAL_NULLS_LAST = Comparator
        .nullsLast(Comparator.<String> naturalOrder());

    @Override
    public int compare(String t1, String t2) {
        final int byRank = Integer.compare(rank(t1), rank(t2));
        if (byRank != 0) {
            return byRank;
        }
        // Same group: fall back to a total order so that compare(a, b) == -compare(b, a).
        return NATURAL_NULLS_LAST.compare(t1, t2);
    }

    /** Returns the precedence group of the given type; lower values sort first. */
    private static int rank(String type) {
        if (type == null) {
            return RANK_OTHER;
        }
        if (type.startsWith("http") && type.contains("coar") && type.contains("resource_type")) {
            return RANK_COAR;
        }
        if (type.startsWith("info:eu-repo/semantics")) {
            return RANK_EU_REPO;
        }
        return RANK_OTHER;
    }
}

View File

@ -116,6 +116,27 @@ class MappersTest {
assertNotNull(instance.getPid());
assertTrue(instance.getPid().isEmpty());
assertNotNull(instance.getInstanceTypeMapping());
assertEquals(2, instance.getInstanceTypeMapping().size());
Optional<InstanceTypeMapping> coarType = instance.getInstanceTypeMapping()
.stream()
.filter(itm -> AbstractMdRecordToOafMapper.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
.findFirst();
assertTrue(coarType.isPresent());
assertEquals("http://purl.org/coar/resource_type/c_5794", coarType.get().getTypeCode());
assertEquals("conference paper", coarType.get().getTypeLabel());
Optional<InstanceTypeMapping> userType = instance.getInstanceTypeMapping()
.stream()
.filter(itm -> AbstractMdRecordToOafMapper.OPENAIRE_USER_RESOURCE_TYPES.equals(itm.getVocabularyName()))
.findFirst();
assertTrue(userType.isPresent());
assertEquals("Article", userType.get().getTypeCode());
assertEquals("Article", userType.get().getTypeLabel());
assertFalse(instance.getAlternateIdentifier().isEmpty());
assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());

View File

@ -1246,3 +1246,8 @@ dnet:relation_subRelType @=@ relationship @=@ publicationDataset
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Proceedings paper
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Conference article
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ http://purl.org/eprint/type/ConferencePaper
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Conference article
openaire::user_resource_types @=@ Article @=@ http://purl.org/coar/resource_type/c_5794

View File

@ -1122,3 +1122,5 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences
openaire::coar_resource_types_3_1 @=@ openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ conference paper
openaire::user_resource_types @=@ openaire::user_resource_types @=@ Article @=@ Article

View File

@ -47,7 +47,9 @@
<dc:subject>provisioning services</dc:subject>
<dc:subject>regulating services</dc:subject>
<dc:subject>supporting services</dc:subject>
<dc:type>Research Article</dc:type>
<dc:type>conference paper</dc:type>
<dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
<dc:type>info:eu-repo/semantics/article</dc:type>
<!--<dr:CobjCategory type="publication">0001</dr:CobjCategory>-->
<dr:CobjCategory>0001</dr:CobjCategory>
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>

View File

@ -888,7 +888,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[3.17.1]</dhp-schemas.version>
<dhp-schemas.version>[4.17.2-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>