forked from D-Net/dnet-hadoop
[raw graph] adopting the new COAR based vocabularies for the resource typing
This commit is contained in:
parent
a460ebe215
commit
554551682d
|
@ -135,6 +135,27 @@ public class VocabularyGroup implements Serializable {
|
|||
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
|
||||
}
|
||||
|
||||
public Qualifier lookupTermBySynonym(final String vocId, final String syn) {
|
||||
if (StringUtils.isBlank(vocId)) {
|
||||
return OafMapperUtils.unknown("", "");
|
||||
}
|
||||
|
||||
final Vocabulary vocabulary = vocs.get(vocId.toLowerCase());
|
||||
|
||||
return Optional
|
||||
.ofNullable(vocabulary.getTerm(syn))
|
||||
.map(
|
||||
term -> OafMapperUtils
|
||||
.qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName()))
|
||||
.orElse(
|
||||
Optional
|
||||
.ofNullable(vocabulary.getTermBySynonym(syn))
|
||||
.map(
|
||||
term -> OafMapperUtils
|
||||
.qualifier(term.getId(), term.getName(), vocabulary.getId(), vocabulary.getName()))
|
||||
.orElse(null));
|
||||
}
|
||||
|
||||
/**
|
||||
* getSynonymAsQualifierCaseSensitive
|
||||
*
|
||||
|
|
|
@ -14,7 +14,6 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
@ -141,6 +140,15 @@ public class OafMapperUtils {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static InstanceTypeMapping instanceTypeMapping(String originalType, Qualifier term) {
|
||||
final InstanceTypeMapping m = new InstanceTypeMapping();
|
||||
m.setVocabularyName(term.getSchemeid());
|
||||
m.setOriginalType(originalType);
|
||||
m.setTypeCode(term.getClassid());
|
||||
m.setTypeLabel(term.getClassname());
|
||||
return m;
|
||||
}
|
||||
|
||||
public static Qualifier unknown(final String schemeid, final String schemename) {
|
||||
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
|
||||
}
|
||||
|
|
|
@ -41,6 +41,11 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
|
||||
|
||||
protected static final String OPENAIRE_COAR_RESOURCE_TYPES_3_1 = "openaire::coar_resource_types_3_1";
|
||||
|
||||
public static final String OPENAIRE_USER_RESOURCE_TYPES = "openaire::user_resource_types";
|
||||
|
||||
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
||||
ModelConstants.ORCID_PENDING,
|
||||
ModelConstants.ORCID_CLASSNAME,
|
||||
|
@ -516,6 +521,32 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||
|
||||
/**
 * Extracts from the input record the single resource type value used to resolve the instance type mappings.
 *
 * @param doc the input document
 * @return the chosen resource type; a {@code null} result makes prepareInstanceTypeMapping return an empty list
 */
protected abstract String findOriginalType(Document doc);
|
||||
|
||||
protected List<InstanceTypeMapping> prepareInstanceTypeMapping(Document doc) {
|
||||
return Optional.ofNullable(findOriginalType(doc))
|
||||
.map(originalType -> {
|
||||
final List<InstanceTypeMapping> mappings = Lists.newArrayList();
|
||||
|
||||
if (vocs.vocabularyExists(OPENAIRE_COAR_RESOURCE_TYPES_3_1)) {
|
||||
|
||||
// TODO verify what the vocabs return when a synonym is not defined
|
||||
Qualifier coarTerm = vocs.lookupTermBySynonym(OPENAIRE_COAR_RESOURCE_TYPES_3_1, originalType);
|
||||
mappings.add(OafMapperUtils.instanceTypeMapping(originalType, coarTerm));
|
||||
|
||||
if (vocs.vocabularyExists(OPENAIRE_USER_RESOURCE_TYPES)) {
|
||||
|
||||
// TODO verify what the vocabs return when a synonym is not defined
|
||||
Qualifier userTerm = vocs.lookupTermBySynonym(OPENAIRE_USER_RESOURCE_TYPES, coarTerm.getClassid());
|
||||
mappings.add(OafMapperUtils.instanceTypeMapping(originalType, userTerm));
|
||||
}
|
||||
}
|
||||
|
||||
return mappings;
|
||||
})
|
||||
.orElse(new ArrayList<>());
|
||||
}
|
||||
|
||||
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||
if (n != null) {
|
||||
|
|
|
@ -5,12 +5,10 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.net.URLDecoder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
|
@ -25,6 +23,8 @@ import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
|||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||
|
||||
import static org.apache.commons.lang3.StringUtils.contains;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||
|
@ -139,6 +139,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||
|
||||
instance.setInstanceTypeMapping(prepareInstanceTypeMapping(doc));
|
||||
|
||||
final Set<StructuredProperty> pids = new HashSet<>(pid);
|
||||
|
||||
instance
|
||||
|
@ -187,6 +189,28 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
return Lists.newArrayList(instance);
|
||||
}
|
||||
|
||||
/**
|
||||
* The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value
|
||||
* So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the
|
||||
* openaire's info:eu-repo type, and as last resort picks the 1st type text available
|
||||
*
|
||||
* <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
|
||||
* <dc:type>info:eu-repo/semantics/article</dc:type>
|
||||
* <dc:type>Conference article</dc:type>
|
||||
*
|
||||
* @param doc the input document
|
||||
* @return the chosen resource type
|
||||
*/
|
||||
@Override
|
||||
protected String findOriginalType(Document doc) {
|
||||
return (String) doc.selectNodes("//dc:type")
|
||||
.stream()
|
||||
.map(o -> "" + ((Node) o).getText().trim())
|
||||
.sorted(new OriginalTypeComparator())
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:source", info);
|
||||
|
|
|
@ -9,6 +9,7 @@ import java.net.URLDecoder;
|
|||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.ObjectUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
|
@ -139,6 +140,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||
|
||||
instance.setInstanceTypeMapping(prepareInstanceTypeMapping(doc));
|
||||
|
||||
final Set<StructuredProperty> pids = new HashSet<>(pid);
|
||||
|
||||
instance
|
||||
|
@ -217,6 +220,30 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<InstanceTypeMapping> prepareInstanceTypeMapping(Document doc) {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* The Datacite element
|
||||
*
|
||||
* <datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType>
|
||||
*
|
||||
* @param doc the input document
|
||||
* @return the chosen resource type
|
||||
*/
|
||||
@Override
|
||||
protected String findOriginalType(Document doc) {
|
||||
final Element resourceType = (Element) doc.selectSingleNode(
|
||||
"//metadata/*[local-name() = 'resource']/*[local-name() = 'resourceType']");
|
||||
|
||||
final String resourceTypeURI = resourceType.attributeValue("anyURI");
|
||||
final String resourceTypeTxt = resourceType.getText();
|
||||
|
||||
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import static org.apache.commons.lang3.StringUtils.contains;
|
||||
import static org.apache.commons.lang3.StringUtils.startsWith;
|
||||
|
||||
/**
 * Orders original resource type values by precedence:
 * <ol>
 * <li>COAR resource type URIs (start with "http" and contain both "coar" and "resource_type");</li>
 * <li>info:eu-repo/semantics types;</li>
 * <li>any other value.</li>
 * </ol>
 * Values with the same precedence are ordered by their natural String order, which keeps the comparison
 * antisymmetric and transitive as required by the {@link Comparator} contract. {@code null} values sort last.
 */
public class OriginalTypeComparator implements Comparator<String> {

	private static final int RANK_COAR = 0;
	private static final int RANK_EU_REPO = 1;
	private static final int RANK_OTHER = 2;

	/**
	 * Compares two original type values.
	 *
	 * @param t1 the first type, may be {@code null}
	 * @param t2 the second type, may be {@code null}
	 * @return a negative value when {@code t1} takes precedence over {@code t2}, a positive value when
	 * {@code t2} takes precedence, zero when they are equal
	 */
	@Override
	public int compare(String t1, String t2) {
		if (t1 == null || t2 == null) {
			if (t1 == null && t2 == null) {
				return 0;
			}
			return t1 == null ? 1 : -1;
		}

		final int byRank = Integer.compare(rank(t1), rank(t2));
		if (byRank != 0) {
			return byRank;
		}

		// same category: fall back to a total order so that compare(a, b) == -compare(b, a)
		return t1.compareTo(t2);
	}

	/** Returns the precedence category of a non-null type value; lower values take precedence. */
	private static int rank(final String type) {
		if (type.startsWith("http") && type.contains("coar") && type.contains("resource_type")) {
			return RANK_COAR;
		}
		if (type.startsWith("info:eu-repo/semantics")) {
			return RANK_EU_REPO;
		}
		return RANK_OTHER;
	}

}
|
|
@ -116,6 +116,27 @@ class MappersTest {
|
|||
assertNotNull(instance.getPid());
|
||||
assertTrue(instance.getPid().isEmpty());
|
||||
|
||||
assertNotNull(instance.getInstanceTypeMapping());
|
||||
assertEquals(2, instance.getInstanceTypeMapping().size());
|
||||
|
||||
Optional<InstanceTypeMapping> coarType = instance.getInstanceTypeMapping()
|
||||
.stream()
|
||||
.filter(itm -> AbstractMdRecordToOafMapper.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
|
||||
.findFirst();
|
||||
|
||||
assertTrue(coarType.isPresent());
|
||||
assertEquals("http://purl.org/coar/resource_type/c_5794", coarType.get().getTypeCode());
|
||||
assertEquals("conference paper", coarType.get().getTypeLabel());
|
||||
|
||||
Optional<InstanceTypeMapping> userType = instance.getInstanceTypeMapping()
|
||||
.stream()
|
||||
.filter(itm -> AbstractMdRecordToOafMapper.OPENAIRE_USER_RESOURCE_TYPES.equals(itm.getVocabularyName()))
|
||||
.findFirst();
|
||||
|
||||
assertTrue(userType.isPresent());
|
||||
assertEquals("Article", userType.get().getTypeCode());
|
||||
assertEquals("Article", userType.get().getTypeLabel());
|
||||
|
||||
assertFalse(instance.getAlternateIdentifier().isEmpty());
|
||||
assertEquals("doi", instance.getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.3897/oneeco.2.e13718", instance.getAlternateIdentifier().get(0).getValue());
|
||||
|
|
|
@ -1246,3 +1246,8 @@ dnet:relation_subRelType @=@ relationship @=@ publicationDataset
|
|||
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
|
||||
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
|
||||
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences
|
||||
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Proceedings paper
|
||||
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Conference article
|
||||
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ http://purl.org/eprint/type/ConferencePaper
|
||||
openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ Conference article
|
||||
openaire::user_resource_types @=@ Article @=@ http://purl.org/coar/resource_type/c_5794
|
|
@ -1122,3 +1122,5 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
|
|||
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
|
||||
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences
|
||||
openaire::coar_resource_types_3_1 @=@ openaire::coar_resource_types_3_1 @=@ http://purl.org/coar/resource_type/c_5794 @=@ conference paper
|
||||
openaire::user_resource_types @=@ openaire::user_resource_types @=@ Article @=@ Article
|
|
@ -47,7 +47,9 @@
|
|||
<dc:subject>provisioning services</dc:subject>
|
||||
<dc:subject>regulating services</dc:subject>
|
||||
<dc:subject>supporting services</dc:subject>
|
||||
<dc:type>Research Article</dc:type>
|
||||
<dc:type>conference paper</dc:type>
|
||||
<dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
|
||||
<dc:type>info:eu-repo/semantics/article</dc:type>
|
||||
<!--<dr:CobjCategory type="publication">0001</dr:CobjCategory>-->
|
||||
<dr:CobjCategory>0001</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -888,7 +888,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[3.17.1]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[4.17.2-SNAPSHOT]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue