[raw graph] mapping original resource types

This commit is contained in:
Claudio Atzori 2023-10-16 12:57:18 +02:00
parent 39d24d5469
commit 6dfcd0c9a2
6 changed files with 308 additions and 97 deletions

View File

@ -157,20 +157,20 @@ public abstract class AbstractMdRecordToOafMapper {
if (vocs.vocabularyExists(OPENAIRE_META_RESOURCE_TYPE)) {
Optional<InstanceTypeMapping> instanceTypeMapping = instances
.stream()
.flatMap(i -> i.getInstanceTypeMapping().stream())
.filter(t -> OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(t.getVocabularyName()))
.findFirst();
.stream()
.flatMap(i -> i.getInstanceTypeMapping().stream())
.filter(t -> OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(t.getVocabularyName()))
.findFirst();
if (!instanceTypeMapping.isPresent()) {
throw new IllegalStateException("unable to find an instance from " + OPENAIRE_COAR_RESOURCE_TYPES_3_1);
return null;
} else {
final String typeCode = instanceTypeMapping.get().getTypeCode();
return Optional
.ofNullable(vocs.lookupTermBySynonym(OPENAIRE_META_RESOURCE_TYPE, typeCode))
.orElseThrow(() ->
new IllegalStateException("unable to find a synonym for '" + typeCode + "' in " +
OPENAIRE_META_RESOURCE_TYPE));
.ofNullable(vocs.lookupTermBySynonym(OPENAIRE_META_RESOURCE_TYPE, typeCode))
.orElseThrow(
() -> new IllegalStateException("unable to find a synonym for '" + typeCode + "' in " +
OPENAIRE_META_RESOURCE_TYPE));
}
} else {
throw new IllegalStateException("vocabulary '" + OPENAIRE_META_RESOURCE_TYPE + "' not available");
@ -197,7 +197,8 @@ public abstract class AbstractMdRecordToOafMapper {
final DataInfo info,
final long lastUpdateTimestamp) {
final OafEntity entity = createEntity(doc, type, metaResourceType, instances, collectedFrom, info, lastUpdateTimestamp);
final OafEntity entity = createEntity(
doc, type, metaResourceType, instances, collectedFrom, info, lastUpdateTimestamp);
final Set<String> originalId = Sets.newHashSet(entity.getOriginalId());
originalId.add(entity.getId());
@ -550,29 +551,33 @@ public abstract class AbstractMdRecordToOafMapper {
// Returns the original type string found in the given record. Implementations may return null
// when no type is available: prepareInstanceTypeMapping wraps the result in Optional.ofNullable.
protected abstract String findOriginalType(Document doc);
protected List<InstanceTypeMapping> prepareInstanceTypeMapping(Document doc) {
return Optional.ofNullable(findOriginalType(doc))
.map(originalType -> {
final List<InstanceTypeMapping> mappings = Lists.newArrayList();
return Optional
.ofNullable(findOriginalType(doc))
.map(originalType -> {
final List<InstanceTypeMapping> mappings = Lists.newArrayList();
if (vocs.vocabularyExists(OPENAIRE_COAR_RESOURCE_TYPES_3_1)) {
if (vocs.vocabularyExists(OPENAIRE_COAR_RESOURCE_TYPES_3_1)) {
// TODO verify what the vocabs return when a synonym is not defined
Optional.ofNullable(vocs.lookupTermBySynonym(OPENAIRE_COAR_RESOURCE_TYPES_3_1, originalType))
.ifPresent(coarTerm -> {
mappings.add(OafMapperUtils.instanceTypeMapping(originalType, coarTerm));
if (vocs.vocabularyExists(OPENAIRE_USER_RESOURCE_TYPES)) {
// TODO verify what the vocabs return when a synonym is not defined
Optional
.ofNullable(vocs.lookupTermBySynonym(OPENAIRE_COAR_RESOURCE_TYPES_3_1, originalType))
.ifPresent(coarTerm -> {
mappings.add(OafMapperUtils.instanceTypeMapping(originalType, coarTerm));
if (vocs.vocabularyExists(OPENAIRE_USER_RESOURCE_TYPES)) {
// TODO verify what the vocabs return when a synonym is not defined
Optional
.ofNullable(vocs.lookupTermBySynonym(OPENAIRE_USER_RESOURCE_TYPES, coarTerm.getClassid()))
.ifPresent(type -> mappings.add(OafMapperUtils.instanceTypeMapping(originalType, type)));
}
});
}
// TODO verify what the vocabs return when a synonym is not defined
Optional
.ofNullable(
vocs.lookupTermBySynonym(OPENAIRE_USER_RESOURCE_TYPES, coarTerm.getClassid()))
.ifPresent(
type -> mappings.add(OafMapperUtils.instanceTypeMapping(originalType, type)));
}
});
}
return mappings;
})
.orElse(new ArrayList<>());
return mappings;
})
.orElse(new ArrayList<>());
}
private Journal prepareJournal(final Document doc, final DataInfo info) {

View File

@ -1,17 +1,8 @@
package eu.dnetlib.dhp.oa.graph.raw;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.net.URLDecoder;
import java.util.ArrayList;
@ -20,8 +11,19 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@ -201,12 +203,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
*/
@Override
protected String findOriginalType(Document doc) {
    // collect the trimmed text of every dc:type element
    final List<String> types = new ArrayList<>();
    for (final Object node : doc.selectNodes("//dc:type")) {
        types.add("" + ((Node) node).getText().trim());
    }

    // the preferred type is the one OriginalTypeComparator ranks first; null when there is none
    types.sort(new OriginalTypeComparator());
    return types.isEmpty() ? null : types.get(0);
}
@Override

View File

@ -230,20 +230,18 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
*/
@Override
protected String findOriginalType(Document doc) {
String resourceType = Optional.ofNullable((Element) doc.selectSingleNode(
return Optional
.ofNullable(
(Element) doc
.selectSingleNode(
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
.map(element -> {
final String resourceTypeURI = element.attributeValue("anyURI");
final String resourceTypeTxt = element.getText();
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt);
})
.orElse(doc.valueOf(
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'CobjCategory']/text()")
);
return resourceType;
.map(element -> {
final String resourceTypeURI = element.attributeValue("anyURI");
final String resourceTypeTxt = element.getText();
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt);
})
.orElse(null);
}
@Override

View File

@ -1,32 +1,33 @@
package eu.dnetlib.dhp.oa.graph.raw;
import java.util.Comparator;
package eu.dnetlib.dhp.oa.graph.raw;
import static org.apache.commons.lang3.StringUtils.contains;
import static org.apache.commons.lang3.StringUtils.startsWith;
import java.util.Comparator;
/**
 * Orders original resource type strings by how specific their vocabulary is:
 * <ol>
 * <li>COAR resource type URIs (start with "http" and contain both "coar" and "resource_type");</li>
 * <li>"info:eu-repo/semantics" types;</li>
 * <li>everything else.</li>
 * </ol>
 * Values in the same group are ordered by their natural string order, with {@code null} last.
 *
 * Ranking both arguments before comparing keeps the ordering antisymmetric: two distinct values of
 * the same group no longer both compare as "less than" each other, so sorting gives the same result
 * regardless of the input order and cannot fail with a comparator contract violation.
 */
public class OriginalTypeComparator implements Comparator<String> {

    private static final int COAR_PRIORITY = 0;
    private static final int EU_REPO_PRIORITY = 1;
    private static final int OTHER_PRIORITY = 2;

    /** Tie-breaker for values of the same group: natural string order, nulls last. */
    private static final Comparator<String> NATURAL_NULLS_LAST = Comparator
        .nullsLast(Comparator.<String> naturalOrder());

    /**
     * @param t1 first type, may be null
     * @param t2 second type, may be null
     * @return a negative, zero or positive value when t1 sorts before, with or after t2
     */
    @Override
    public int compare(String t1, String t2) {
        final int byPriority = Integer.compare(priority(t1), priority(t2));
        return byPriority != 0 ? byPriority : NATURAL_NULLS_LAST.compare(t1, t2);
    }

    /** Maps a type to its group; lower values sort first, null falls in the last group. */
    private static int priority(String type) {
        if (type == null) {
            return OTHER_PRIORITY;
        }
        if (type.startsWith("http") && type.contains("coar") && type.contains("resource_type")) {
            return COAR_PRIORITY;
        }
        if (type.startsWith("info:eu-repo/semantics")) {
            return EU_REPO_PRIORITY;
        }
        return OTHER_PRIORITY;
    }
}

View File

@ -14,6 +14,8 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.dom4j.DocumentException;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -25,8 +27,11 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.clean.CleaningRuleMap;
import eu.dnetlib.dhp.oa.graph.clean.OafCleaner;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -119,19 +124,21 @@ class MappersTest {
assertNotNull(instance.getInstanceTypeMapping());
assertEquals(2, instance.getInstanceTypeMapping().size());
Optional<InstanceTypeMapping> coarType = instance.getInstanceTypeMapping()
.stream()
.filter(itm -> ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
.findFirst();
Optional<InstanceTypeMapping> coarType = instance
.getInstanceTypeMapping()
.stream()
.filter(itm -> ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
.findFirst();
assertTrue(coarType.isPresent());
assertEquals("http://purl.org/coar/resource_type/c_5794", coarType.get().getTypeCode());
assertEquals("conference paper", coarType.get().getTypeLabel());
Optional<InstanceTypeMapping> userType = instance.getInstanceTypeMapping()
.stream()
.filter(itm -> ModelConstants.OPENAIRE_USER_RESOURCE_TYPES.equals(itm.getVocabularyName()))
.findFirst();
Optional<InstanceTypeMapping> userType = instance
.getInstanceTypeMapping()
.stream()
.filter(itm -> ModelConstants.OPENAIRE_USER_RESOURCE_TYPES.equals(itm.getVocabularyName()))
.findFirst();
assertTrue(userType.isPresent());
assertEquals("Article", userType.get().getTypeCode());
@ -266,8 +273,8 @@ class MappersTest {
});
Publication p_cleaned = cleanup(p, vocs);
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
assertEquals("0002", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("nonPeerReviewed", p_cleaned.getInstance().get(0).getRefereed().getClassname());
assertNotNull(p.getInstance().get(0).getPid());
assertEquals(2, p.getInstance().get(0).getPid().size());
@ -485,8 +492,8 @@ class MappersTest {
});
Publication p_cleaned = cleanup(p, vocs);
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
assertEquals("0002", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("nonPeerReviewed", p_cleaned.getInstance().get(0).getRefereed().getClassname());
}
@Test
@ -604,8 +611,137 @@ class MappersTest {
assertTrue(i.getUrl().contains("https://clinicaltrials.gov/ct2/show/NCT02321059"));
Dataset d_cleaned = cleanup(d, vocs);
assertEquals("0000", d_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", d_cleaned.getInstance().get(0).getRefereed().getClassname());
assertEquals("0002", d_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("nonPeerReviewed", d_cleaned.getInstance().get(0).getRefereed().getClassname());
}
/**
 * Maps the oaf_crossref.xml fixture with OafToOafMapper and verifies the resulting Publication:
 * record-level metadata, its single instance, the refereed qualifier before and after cleanup,
 * the meta resource type and the instance type mappings derived from the original dc:type.
 */
@Test
void test_record_from_Crossref() throws IOException {
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_crossref.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
// the record is expected to produce exactly one entity, a Publication
assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Publication);
final Publication p = OafCleaner.apply(fixVocabularyNames((Publication) list.get(0)), mapping);
// collection / transformation dates and provenance info
assertNotNull(p.getDateofcollection());
assertEquals("2020-08-06T07:04:09.62Z", p.getDateofcollection());
assertNotNull(p.getDateoftransformation());
assertEquals("2020-08-06T07:20:57.911Z", p.getDateoftransformation());
assertNotNull(p.getDataInfo());
assertFalse(p.getDataInfo().getInvisible());
assertFalse(p.getDataInfo().getDeletedbyinference());
assertEquals("0.9", p.getDataInfo().getTrust());
// identifiers
assertValidId(p.getId());
assertEquals(2, p.getOriginalId().size());
assertEquals("50|doi_________::7f0f7807f17db50e5c2b5c452ccaf06d", p.getOriginalId().get(0));
assertValidId(p.getCollectedfrom().get(0).getKey());
// descriptive metadata: title, description, authors
assertNotNull(p.getTitle());
assertEquals(1, p.getTitle().size());
assertEquals(
"A case report of serious haemolysis in a glucose-6-phosphate dehydrogenase-deficient COVID-19 patient receiving hydroxychloroquine",
p
.getTitle()
.get(0)
.getValue());
assertNotNull(p.getDescription());
assertEquals(0, p.getDescription().size());
assertEquals(8, p.getAuthor().size());
// the single instance: access right, provenance datasources, instance type, dates, pids, urls
assertNotNull(p.getInstance());
assertEquals(1, p.getInstance().size());
final Instance i = p.getInstance().get(0);
assertNotNull(i.getAccessright());
assertEquals(ModelConstants.DNET_ACCESS_MODES, i.getAccessright().getSchemeid());
assertEquals(ModelConstants.DNET_ACCESS_MODES, i.getAccessright().getSchemename());
assertEquals("OPEN", i.getAccessright().getClassid());
assertEquals("Open Access", i.getAccessright().getClassname());
assertNotNull(i.getCollectedfrom());
assertEquals("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", i.getCollectedfrom().getKey());
assertEquals("Crossref", i.getCollectedfrom().getValue());
assertNotNull(i.getHostedby());
assertEquals("10|openaire____::55045bd2a65019fd8e6741a755395c8c", i.getHostedby().getKey());
assertEquals("Unknown Repository", i.getHostedby().getValue());
assertNotNull(i.getInstancetype());
assertEquals("0001", i.getInstancetype().getClassid());
assertEquals("Article", i.getInstancetype().getClassname());
assertEquals(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getSchemeid());
assertEquals(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getSchemename());
assertNull(i.getLicense());
assertNotNull(i.getDateofacceptance());
assertEquals("2020-06-04", i.getDateofacceptance().getValue());
assertNull(i.getProcessingchargeamount());
assertNull(i.getProcessingchargecurrency());
assertNotNull(i.getPid());
assertEquals(1, i.getPid().size());
assertNotNull(i.getAlternateIdentifier());
assertEquals(0, i.getAlternateIdentifier().size());
assertNotNull(i.getUrl());
assertEquals(1, i.getUrl().size());
assertTrue(i.getUrl().contains("http://dx.doi.org/10.1080/23744235.2020.1774644"));
// before cleanup() the refereed qualifier is empty
assertEquals("", p.getInstance().get(0).getRefereed().getClassid());
assertEquals("", p.getInstance().get(0).getRefereed().getClassname());
// after cleanup() the refereed qualifier is expected to be peerReviewed
Publication p_cleaned = cleanup(p, vocs);
assertEquals("0001", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("peerReviewed", p_cleaned.getInstance().get(0).getRefereed().getClassname());
// meta resource type
assertNotNull(p_cleaned.getMetaResourceType());
assertEquals("Research Literature", p_cleaned.getMetaResourceType().getClassid());
assertEquals("Research Literature", p_cleaned.getMetaResourceType().getClassname());
assertEquals(ModelConstants.OPENAIRE_META_RESOURCE_TYPE, p_cleaned.getMetaResourceType().getSchemeid());
assertEquals(ModelConstants.OPENAIRE_META_RESOURCE_TYPE, p_cleaned.getMetaResourceType().getSchemename());
// the original type "journal-article" must be mapped once per vocabulary (COAR and user types)
assertNotNull(p_cleaned.getInstance().get(0).getInstanceTypeMapping());
assertEquals(2, p_cleaned.getInstance().get(0).getInstanceTypeMapping().size());
assertTrue(
p_cleaned
.getInstance()
.get(0)
.getInstanceTypeMapping()
.stream()
.anyMatch(
t -> "journal-article".equals(t.getOriginalType()) &&
ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(t.getVocabularyName()) &&
"http://purl.org/coar/resource_type/c_2df8fbb1".equals(t.getTypeCode()) &&
"research article".equals(t.getTypeLabel())));
assertTrue(
p_cleaned
.getInstance()
.get(0)
.getInstanceTypeMapping()
.stream()
.anyMatch(
t -> "journal-article".equals(t.getOriginalType()) &&
ModelConstants.OPENAIRE_USER_RESOURCE_TYPES.equals(t.getVocabularyName()) &&
"Article".equals(t.getTypeCode()) &&
"Article".equals(t.getTypeLabel())));
}
@Test
@ -908,8 +1044,8 @@ class MappersTest {
});
Dataset p_cleaned = cleanup(p, vocs);
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
assertEquals("0002", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("nonPeerReviewed", p_cleaned.getInstance().get(0).getRefereed().getClassname());
}
@Test

View File

@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>doi_________::7f0f7807f17db50e5c2b5c452ccaf06d</dri:objIdentifier>
<dri:recordIdentifier>doi_________::7f0f7807f17db50e5c2b5c452ccaf06d</dri:recordIdentifier>
<dri:dateOfCollection>2020-08-06T07:04:09.62Z</dri:dateOfCollection>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection/>
<dr:dateOfTransformation>2020-08-06T07:20:57.911Z</dr:dateOfTransformation>
<oaf:datasourceprefix>openaire____</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>A case report of serious haemolysis in a glucose-6-phosphate dehydrogenase-deficient COVID-19 patient receiving hydroxychloroquine</dc:title>
<dc:creator>Maillart, E.</dc:creator>
<dc:creator>Leemans, S.</dc:creator>
<dc:creator>Van Noten, H.</dc:creator>
<dc:creator>Vandergraesen, T.</dc:creator>
<dc:creator>Mahadeb, B.</dc:creator>
<dc:creator>Salaouatchi, M. T.</dc:creator>
<dc:creator>De Bels, D.</dc:creator>
<dc:creator>Clevenbergh, P.</dc:creator>
<dc:date/>
<dc:identifier>http://dx.doi.org/10.1080/23744235.2020.1774644</dc:identifier>
<dc:language/>
<dc:publisher>Informa UK Limited</dc:publisher>
<dc:source>Crossref</dc:source>
<dc:source>Infectious Diseases</dc:source>
<dc:subject>Microbiology (medical)</dc:subject>
<dc:subject>General Immunology and Microbiology</dc:subject>
<dc:subject>Infectious Diseases</dc:subject>
<dc:subject>General Medicine</dc:subject>
<dc:type>journal-article</dc:type>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2020-06-04</oaf:dateAccepted>
<oaf:projectid/>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy
id="openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18" name="Unknown Repository"/>
<oaf:collectedFrom id="openaire____::crossref" name="Crossref"/>
<oaf:identifier identifierType="doi">10.1080/23744235.2020.1774644</oaf:identifier>
<oaf:journal eissn="2374-4243" ep="3" iss="" issn="2374-4235" sp="1" vol="">Infectious Diseases</oaf:journal>
</metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-08-06T07:04:09.62Z">
<baseURL>file%3A%2F%2F%2Fsrv%2Fclaims%2Frecords%2Fpublication%2Fcrossref</baseURL>
<identifier/>
<datestamp/>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="user:claim" classname="user:claim"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>