From 982bcc1e3535e15dcaeca2cd14551d216e9356c8 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Fri, 23 Sep 2022 12:06:06 +0200 Subject: [PATCH 1/5] test wrid pid and record identifier --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 33 +++++++++++++++++++ .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 13 ++++++-- pom.xml | 2 +- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 5781988e62..7d615a1f09 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -391,6 +391,39 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final String docId = entity.getId(); final List res = new ArrayList<>(); + /* + /* + + https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/b4be0f3e-41d7-471f-b34e-f0bd54ff5698 + https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/5d6e575b-ef84-417a-9d76-61c6702f7cb2 + https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/35e01545-8c6d-49bd-ab98-5c152df69934 + + We could extend it to create the relationships targeting w3id, dois, pmcids and other pid types for which we know how to build the target openaire identifier "blindly". 
+ + + + for (final Object o : doc + .selectNodes("//*[local-name()='relatedIdentifier']")) { + + final String originalId = ((Node) o).getText(); + + if (StringUtils.isNotBlank(originalId)) { + final String otherId = createOpenaireId(50, originalId, false); + final String type = ((Node) o).valueOf("@relationType"); + switch(type){ + case IS_SUPPLEMENT_TO: + break; + case SUPPLEMENT: + break; + case IS_PART_OF: + break; + case HAS_PART: + break; + + + } + + */ for (final Object o : doc .selectNodes("//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 64b68e6af1..b163ecff3a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -912,13 +912,13 @@ class MappersTest { } @Test - void testROHub() throws IOException, DocumentException { + void testROHub() throws IOException { final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub.xml"))); final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); -// final Dataset p = (Dataset) list.get(0); +// final OtherResearchProduct p = (OtherResearchProduct) list.get(0); // assertValidId(p.getId()); // assertValidId(p.getCollectedfrom().get(0).getKey()); // System.out.println(p.getTitle().get(0).getValue()); @@ -926,13 +926,20 @@ class MappersTest { } @Test - void testROHub2() throws IOException, DocumentException { + void testROHub2() throws IOException { final String xml = IOUtils 
.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml"))); final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); + final OtherResearchProduct p = (OtherResearchProduct) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertEquals("50|w3id________::afc7592914ae190a50570db90f55f9c2", p.getId()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertEquals("w3id", (p.getPid().get(0).getQualifier().getClassid())); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", (p.getPid().get(0).getValue())); } @Test diff --git a/pom.xml b/pom.xml index a1b26966e1..55265bf555 100644 --- a/pom.xml +++ b/pom.xml @@ -807,7 +807,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [3.14.0] + [3.14.0-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6] From ba33ff71fdc9eaf142d329ae5430e6ef8f81474b Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Fri, 23 Sep 2022 15:17:13 +0200 Subject: [PATCH 2/5] refactoring for the generation of relationships from related identifier of type 'OPENAIRE' --- .../raw/MigrateDbEntitiesApplication.java | 3 +- .../dhp/oa/graph/raw/OdfToOafMapper.java | 117 +++++++++--------- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 1 + .../dhp/oa/graph/raw/rohub-modified.xml | 3 + 4 files changed, 66 insertions(+), 58 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 2b1c257ad4..c69a7a6ffa 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ 
-422,7 +422,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final Relation r2 = OafMapperUtils .getRelation( - orgId, dsId, DATASOURCE_ORGANIZATION, PROVISION, PROVIDES, collectedFrom, info, lastUpdateTimestamp); + orgId, dsId, DATASOURCE_ORGANIZATION, PROVISION, PROVIDES, collectedFrom, info, + lastUpdateTimestamp); return Arrays.asList(r1, r2); } catch (final Exception e) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 7d615a1f09..fc063a242c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -25,6 +25,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @@ -391,75 +392,77 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final String docId = entity.getId(); final List res = new ArrayList<>(); - /* - /* - - https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/b4be0f3e-41d7-471f-b34e-f0bd54ff5698 - https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/5d6e575b-ef84-417a-9d76-61c6702f7cb2 - https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/35e01545-8c6d-49bd-ab98-5c152df69934 - - We could extend it to create the relationships targeting w3id, dois, pmcids and other pid types for which we know how to build the target openaire identifier "blindly". 
- - for (final Object o : doc - .selectNodes("//*[local-name()='relatedIdentifier']")) { + .selectNodes("//*[local-name()='relatedIdentifier']")) { - final String originalId = ((Node) o).getText(); + final String originalId = ((Node) o).getText().trim(); if (StringUtils.isNotBlank(originalId)) { - final String otherId = createOpenaireId(50, originalId, false); - final String type = ((Node) o).valueOf("@relationType"); - switch(type){ - case IS_SUPPLEMENT_TO: - break; - case SUPPLEMENT: - break; - case IS_PART_OF: - break; - case HAS_PART: - break; + final String idType = ((Node) o).valueOf("@relatedIdentifierType"); + final String reltype = ((Node) o).valueOf("@relationType"); + String otherId = guessRelatedIdentifier(idType, originalId); + if (StringUtils.isNotBlank(otherId)) { + if (reltype.equalsIgnoreCase(IS_SUPPLEMENT_TO)) { + res + .add( + getRelation( + docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity)); + res + .add( + getRelation( + otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity)); + } else { + if (reltype.equalsIgnoreCase(IS_SUPPLEMENTED_BY)) { + res + .add( + getRelation( + otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity)); + res + .add( + getRelation( + docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity)); + } else { + if (reltype.equalsIgnoreCase(IS_PART_OF)) { + res + .add( + getRelation( + docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity)); + res + .add( + getRelation( + otherId, docId, RESULT_RESULT, PART, HAS_PART, entity)); + } else { + if (reltype.equalsIgnoreCase(HAS_PART)) { + res + .add( + getRelation( + otherId, docId, RESULT_RESULT, PART, IS_PART_OF, entity)); + res + .add( + getRelation( + docId, otherId, RESULT_RESULT, PART, HAS_PART, entity)); + } + // else TODO catch more semantics + } + } + } - - } - - */ - - for (final Object o : doc - .selectNodes("//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { - - final String 
originalId = ((Node) o).getText(); - - if (StringUtils.isNotBlank(originalId)) { - final String otherId = createOpenaireId(50, originalId, false); - final String type = ((Node) o).valueOf("@relationType"); - - if (type.equalsIgnoreCase(IS_SUPPLEMENT_TO)) { - res - .add( - getRelation( - docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity)); - res - .add( - getRelation( - otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity)); - } else if (type.equalsIgnoreCase(IS_PART_OF)) { - res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity)); - res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PART, HAS_PART, entity)); - } else { - // TODO catch more semantics } } } return res; } + protected String guessRelatedIdentifier(final String idType, final String value) { + if (StringUtils.isBlank(idType) || StringUtils.isBlank(value)) + return null; + if (idType.equalsIgnoreCase("OPENAIRE")) { + return createOpenaireId(50, value, false); + } else + return null; + } + @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return prepareQualifier( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b163ecff3a..fbfbf5af8a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -933,6 +933,7 @@ class MappersTest { System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); + assertEquals(3, list.size()); final OtherResearchProduct p = (OtherResearchProduct) list.get(0); assertValidId(p.getId()); assertValidId(p.getCollectedfrom().get(0).getKey()); diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub-modified.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub-modified.xml index 95d65ac8d5..ce846a2cf9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub-modified.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub-modified.xml @@ -30,6 +30,9 @@ https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d + + fsh_____4119::afc7592914ae190a50570db90f55f9c3 + RO-crate From c5eb72217047e15fea0b8d9f0c46fb4acdae45bc Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Fri, 23 Sep 2022 15:47:05 +0200 Subject: [PATCH 3/5] relationships from relatedIdentifier whose target id type is one of the pid type with an authority --- .../raw/AbstractMdRecordToOafMapper.java | 1206 +++++++++-------- .../dhp/oa/graph/raw/OdfToOafMapper.java | 10 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 17 +- 3 files changed, 630 insertions(+), 603 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index a8d09e4a7f..bb5472a882 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -15,6 +15,7 @@ import java.net.URL; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; import org.apache.commons.lang3.StringUtils; import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.*; @@ -33,603 +34,612 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public abstract class AbstractMdRecordToOafMapper { - protected 
final VocabularyGroup vocs; - - private final boolean invisible; - - private final boolean shouldHashId; - - private final boolean forceOriginalId; - - protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; - protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; - protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; - protected static final Qualifier ORCID_PID_TYPE = qualifier( - ModelConstants.ORCID_PENDING, - ModelConstants.ORCID_CLASSNAME, - DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier( - "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); - - protected static final String DEFAULT_TRUST_FOR_VALIDATED_RELS = "0.999"; - - protected static final Map nsContext = new HashMap<>(); - - private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class); - - static { - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); - } - - protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, - final boolean shouldHashId, final boolean forceOriginalId) { - this.vocs = vocs; - this.invisible = invisible; - this.shouldHashId = shouldHashId; - this.forceOriginalId = forceOriginalId; - } - - protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, - final boolean shouldHashId) { 
- this.vocs = vocs; - this.invisible = invisible; - this.shouldHashId = shouldHashId; - this.forceOriginalId = false; - } - - public List processMdRecord(final String xml) { - - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - try { - final Document doc = DocumentHelper - .parseText( - xml - .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) - .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) - .replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); - - final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - - if (collectedFrom == null) { - return Lists.newArrayList(); - } - - final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - - if (hostedBy == null) { - return Lists.newArrayList(); - } - - final DataInfo info = prepareDataInfo(doc, invisible); - final long lastUpdateTimestamp = new Date().getTime(); - - final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); - - final String type = getResultType(doc, instances); - - return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); - } catch (DocumentException e) { - log.error("Error with record:\n" + xml); - return Lists.newArrayList(); - } - } - - protected String getResultType(final Document doc, final List instances) { - final String type = doc.valueOf("//dr:CobjCategory/@type"); - - if (StringUtils.isBlank(type) && vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { - final String instanceType = instances - .stream() - .map(i -> i.getInstancetype().getClassid()) - .findFirst() - .filter(s -> !UNKNOWN.equalsIgnoreCase(s)) - .orElse("0000"); // Unknown - return Optional - .ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) - .map(Qualifier::getClassid) - 
.orElse("0000"); - } - - return type; - } - - private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { - final String dsId = doc.valueOf(xpathId); - final String dsName = doc.valueOf(xpathName); - - if (StringUtils.isBlank(dsId) || StringUtils.isBlank(dsName)) { - return null; - } - - return keyValue(createOpenaireId(10, dsId, true), dsName); - } - - protected List createOafs( - final Document doc, - final String type, - final List instances, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - - final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); - - final Set originalId = Sets.newHashSet(entity.getOriginalId()); - originalId.add(entity.getId()); - entity.setOriginalId(Lists.newArrayList(originalId)); - - if (!forceOriginalId) { - final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); - if (!id.equals(entity.getId())) { - entity.setId(id); - } - } - - final List oafs = Lists.newArrayList(entity); - - if (!oafs.isEmpty()) { - Set rels = Sets.newHashSet(); - - rels.addAll(addProjectRels(doc, entity)); - rels.addAll(addOtherResultRels(doc, entity)); - rels.addAll(addRelations(doc, entity)); - - oafs.addAll(rels); - } - - return oafs; - } - - private OafEntity createEntity(final Document doc, - final String type, - final List instances, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); - p.setJournal(prepareJournal(doc, info)); - return p; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - 
d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - return d; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - return s; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - return o; - } - } - - private List addProjectRels( - final Document doc, - final OafEntity entity) { - - final List res = new ArrayList<>(); - - final String docId = entity.getId(); - - for (final Object o : doc.selectNodes("//oaf:projectid")) { - - final String originalId = ((Node) o).getText(); - - final String validationdDate = ((Node) o).valueOf("@validationDate"); - - if (StringUtils.isNotBlank(originalId)) { - final String projectId = createOpenaireId(40, originalId, true); - - res - .add( - OafMapperUtils - .getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); - res - .add( - OafMapperUtils - .getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); - } - } - - return res; - } - - private List addRelations(Document doc, OafEntity entity) { - - final List 
rels = Lists.newArrayList(); - - for (Object o : doc.selectNodes("//oaf:relation")) { - Element element = (Element) o; - - final String target = StringUtils.trim(element.getText()); - final String relType = element.attributeValue("relType"); - final String subRelType = element.attributeValue("subRelType"); - final String relClass = element.attributeValue("relClass"); - - if (StringUtils.isNotBlank(target) && StringUtils.isNotBlank(relType) && StringUtils.isNotBlank(subRelType) - && StringUtils.isNotBlank(relClass)) { - - final String relClassInverse = ModelSupport - .findInverse(ModelSupport.rel(relType, subRelType, relClass)) - .getInverseRelClass(); - final String validationdDate = ((Node) o).valueOf("@validationDate"); - - if (StringUtils.isNotBlank(target)) { - final String targetType = element.attributeValue("targetType"); - if (StringUtils.isNotBlank(targetType)) { - final String targetId = createOpenaireId(targetType, target, true); - rels - .add( - OafMapperUtils - .getRelation( - entity.getId(), targetId, relType, subRelType, relClass, entity, - validationdDate)); - rels - .add( - OafMapperUtils - .getRelation( - targetId, entity.getId(), relType, subRelType, relClassInverse, entity, - validationdDate)); - } - } - } - } - return rels; - } - - protected abstract List addOtherResultRels( - final Document doc, - final OafEntity entity); - - private void populateResultFields( - final Result r, - final Document doc, - final List instances, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(findOriginalId(doc)); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid(IdentifierFactory.getPids(prepareResultPids(doc, info), collectedFrom)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()")); - 
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(prepareAuthors(doc, info)); - r.setLanguage(prepareLanguages(doc)); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareSubjects(doc, info)); - r.setTitle(prepareTitles(doc, info)); - r.setRelevantdate(prepareRelevantDates(doc, info)); - r.setDescription(prepareDescriptions(doc, info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); - r.setSource(prepareSources(doc, info)); - r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); - r.setFormat(prepareFormats(doc, info)); - r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(prepareResourceType(doc, info)); - r.setCoverage(prepareCoverages(doc, info)); - r.setContext(prepareContexts(doc, info)); - r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r - .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); - r - .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - - r.setInstance(instances); - r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); - } - - protected abstract List prepareResultPids(Document doc, DataInfo info); - - private List prepareContexts(final Document doc, final DataInfo info) { - final List list = new ArrayList<>(); - for (final Object o : doc.selectNodes("//oaf:concept")) { - final String cid = ((Node) o).valueOf("@id"); - if (StringUtils.isNotBlank(cid)) { - final Context c = new Context(); - c.setId(cid); - c.setDataInfo(Arrays.asList(info)); - list.add(c); - } - } - return list; - } - - protected abstract Qualifier prepareResourceType(Document 
doc, DataInfo info); - - protected abstract List prepareInstances( - Document doc, - DataInfo info, - KeyValue collectedfrom, - KeyValue hostedby); - - protected abstract List> prepareSources(Document doc, DataInfo info); - - protected abstract List prepareRelevantDates(Document doc, DataInfo info); - - protected abstract List> prepareCoverages(Document doc, DataInfo info); - - protected abstract List> prepareContributors(Document doc, DataInfo info); - - protected abstract List> prepareFormats(Document doc, DataInfo info); - - protected abstract Field preparePublisher(Document doc, DataInfo info); - - protected abstract List> prepareDescriptions(Document doc, DataInfo info); - - protected abstract List prepareTitles(Document doc, DataInfo info); - - protected abstract List prepareSubjects(Document doc, DataInfo info); - - protected abstract Qualifier prepareLanguages(Document doc); - - protected abstract List prepareAuthors(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductTools( - Document doc, - DataInfo info); - - protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, - DataInfo info); - - protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, - DataInfo info); - - protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); - - protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); - - protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); - - protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, - DataInfo info); - - protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); - - protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); - - protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); - - protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); 
- - protected abstract Field prepareDatasetSize(Document doc, DataInfo info); - - protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); - - protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - - private Journal prepareJournal(final Document doc, final DataInfo info) { - final Node n = doc.selectSingleNode("//oaf:journal"); - if (n != null) { - final String name = n.getText(); - final String issnPrinted = n.valueOf("@issn"); - final String issnOnline = n.valueOf("@eissn"); - final String issnLinking = n.valueOf("@lissn"); - final String ep = n.valueOf("@ep"); - final String iss = n.valueOf("@iss"); - final String sp = n.valueOf("@sp"); - final String vol = n.valueOf("@vol"); - final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); - } - } - return null; - } - - private List findOriginalId(final Document doc) { - final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n != null) { - final String id = n.valueOf("./*[local-name()='identifier']"); - if (StringUtils.isNotBlank(id)) { - return Lists.newArrayList(id); - } - } - final List idList = doc - .selectNodes( - "normalize-space(//*[local-name()='header']/*[local-name()='identifier' or local-name()='recordIdentifier']/text())"); - final Set originalIds = Sets.newHashSet(idList); - - if (originalIds.isEmpty()) { - throw new IllegalStateException("missing originalID on " + doc.asXML()); - } - return Lists.newArrayList(originalIds); - } - - protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) { - final Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId); - final AccessRight accessRight = new AccessRight(); - accessRight.setClassid(qualifier.getClassid()); - 
accessRight.setClassname(qualifier.getClassname()); - accessRight.setSchemeid(qualifier.getSchemeid()); - accessRight.setSchemename(qualifier.getSchemename()); - - // TODO set the OAStatus - - return accessRight; - } - - protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) { - return prepareQualifier(node.valueOf(xpath).trim(), schemeId); - } - - protected Qualifier prepareQualifier(final String classId, final String schemeId) { - return vocs.getTermAsQualifier(schemeId, classId); - } - - protected List prepareListStructPropsWithValidQualifier( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final DataInfo info) { - final List res = new ArrayList<>(); - - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId).trim(); - if (vocs.termExists(schemeId, classId)) { - res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info)); - } - } - return res; - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final Qualifier qualifier, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), qualifier, info)); - } - return res; - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res - .add( - structuredProperty( - n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), - n.valueOf("@schemename"), info)); - } - return res; - } - - protected List prepareSubjectList( - final Node node, - final String xpath, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final 
Node n = (Node) o; - res - .add( - subject( - n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), - n.valueOf("@schemename"), info)); - } - return res; - } - - protected OAIProvenance prepareOAIprovenance(final Document doc) { - final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - - if (n == null) { - return null; - } - - final String identifier = n.valueOf("./*[local-name()='identifier']"); - final String baseURL = n.valueOf("./*[local-name()='baseURL']"); - final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); - final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); - final String datestamp = n.valueOf("./*[local-name()='datestamp']"); - final String harvestDate = n.valueOf("@harvestDate"); - - return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); - } - - protected DataInfo prepareDataInfo(final Document doc, final boolean invisible) { - final Node n = doc.selectSingleNode("//oaf:datainfo"); - - if (n == null) { - return dataInfo(false, null, false, invisible, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); - } - - final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); - final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); - final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); - final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); - - final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); - final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); - final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); - final String trust = n.valueOf("./oaf:trust"); - - return dataInfo( - deletedbyinference, inferenceprovenance, inferred, invisible, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); - } - - protected Field 
prepareField(final Node node, final String xpath, final DataInfo info) { - return field(node.valueOf(xpath), info); - } - - protected List> prepareListFields( - final Node node, - final String xpath, - final DataInfo info) { - return listFields(info, prepareListString(node, xpath)); - } - - protected List prepareListString(final Node node, final String xpath) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final String s = ((Node) o).getText().trim(); - if (StringUtils.isNotBlank(s)) { - res.add(s); - } - } - return res; - } - - protected Set validateUrl(Collection url) { - UrlValidator urlValidator = UrlValidator.getInstance(); - if (Objects.isNull(url)) { - return new HashSet<>(); - } - return url - .stream() - .filter(u -> urlValidator.isValid(u)) - .collect(Collectors.toCollection(HashSet::new)); - } + protected final VocabularyGroup vocs; + + private final boolean invisible; + + private final boolean shouldHashId; + + private final boolean forceOriginalId; + + protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; + protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; + protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; + protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; + protected static final Qualifier ORCID_PID_TYPE = qualifier( + ModelConstants.ORCID_PENDING, + ModelConstants.ORCID_CLASSNAME, + DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier( + "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + + protected static final String DEFAULT_TRUST_FOR_VALIDATED_RELS = "0.999"; + + protected static final Map nsContext = new HashMap<>(); + + private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class); + + static { + 
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); + } + + protected static final Set pidTypeWithAuthority = new HashSet<>(); + + static { + pidTypeWithAuthority.addAll(IdentifierFactory.PID_AUTHORITY.keySet().stream() + .map(PidType::toString) + .map(String::toLowerCase) + .collect(Collectors.toCollection(HashSet::new))); + } + + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, + final boolean shouldHashId, final boolean forceOriginalId) { + this.vocs = vocs; + this.invisible = invisible; + this.shouldHashId = shouldHashId; + this.forceOriginalId = forceOriginalId; + } + + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, + final boolean shouldHashId) { + this.vocs = vocs; + this.invisible = invisible; + this.shouldHashId = shouldHashId; + this.forceOriginalId = false; + } + + public List processMdRecord(final String xml) { + + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + try { + final Document doc = DocumentHelper + .parseText( + xml + .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); + + final KeyValue collectedFrom = getProvenanceDatasource( + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + + if (collectedFrom == null) { + return Lists.newArrayList(); + } + + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) + ? 
collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + + if (hostedBy == null) { + return Lists.newArrayList(); + } + + final DataInfo info = prepareDataInfo(doc, invisible); + final long lastUpdateTimestamp = new Date().getTime(); + + final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); + + final String type = getResultType(doc, instances); + + return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + } catch (DocumentException e) { + log.error("Error with record:\n" + xml); + return Lists.newArrayList(); + } + } + + protected String getResultType(final Document doc, final List instances) { + final String type = doc.valueOf("//dr:CobjCategory/@type"); + + if (StringUtils.isBlank(type) && vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { + final String instanceType = instances + .stream() + .map(i -> i.getInstancetype().getClassid()) + .findFirst() + .filter(s -> !UNKNOWN.equalsIgnoreCase(s)) + .orElse("0000"); // Unknown + return Optional + .ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) + .map(Qualifier::getClassid) + .orElse("0000"); + } + + return type; + } + + private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { + final String dsId = doc.valueOf(xpathId); + final String dsName = doc.valueOf(xpathName); + + if (StringUtils.isBlank(dsId) || StringUtils.isBlank(dsName)) { + return null; + } + + return keyValue(createOpenaireId(10, dsId, true), dsName); + } + + protected List createOafs( + final Document doc, + final String type, + final List instances, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + + final Set originalId = Sets.newHashSet(entity.getOriginalId()); + originalId.add(entity.getId()); + 
entity.setOriginalId(Lists.newArrayList(originalId)); + + if (!forceOriginalId) { + final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); + if (!id.equals(entity.getId())) { + entity.setId(id); + } + } + + final List oafs = Lists.newArrayList(entity); + + if (!oafs.isEmpty()) { + Set rels = Sets.newHashSet(); + + rels.addAll(addProjectRels(doc, entity)); + rels.addAll(addOtherResultRels(doc, entity)); + rels.addAll(addRelations(doc, entity)); + + oafs.addAll(rels); + } + + return oafs; + } + + private OafEntity createEntity(final Document doc, + final String type, + final List instances, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + switch (type.toLowerCase()) { + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); + p.setJournal(prepareJournal(doc, info)); + return p; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + return d; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + return s; + case "": + case "otherresearchproducts": + default: + final 
OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + return o; + } + } + + private List addProjectRels( + final Document doc, + final OafEntity entity) { + + final List res = new ArrayList<>(); + + final String docId = entity.getId(); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + + final String originalId = ((Node) o).getText(); + + final String validationdDate = ((Node) o).valueOf("@validationDate"); + + if (StringUtils.isNotBlank(originalId)) { + final String projectId = createOpenaireId(40, originalId, true); + + res + .add( + OafMapperUtils + .getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); + res + .add( + OafMapperUtils + .getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); + } + } + + return res; + } + + private List addRelations(Document doc, OafEntity entity) { + + final List rels = Lists.newArrayList(); + + for (Object o : doc.selectNodes("//oaf:relation")) { + Element element = (Element) o; + + final String target = StringUtils.trim(element.getText()); + final String relType = element.attributeValue("relType"); + final String subRelType = element.attributeValue("subRelType"); + final String relClass = element.attributeValue("relClass"); + + if (StringUtils.isNotBlank(target) && StringUtils.isNotBlank(relType) && StringUtils.isNotBlank(subRelType) + && StringUtils.isNotBlank(relClass)) { + + final String relClassInverse = ModelSupport + .findInverse(ModelSupport.rel(relType, subRelType, relClass)) + .getInverseRelClass(); + final String validationdDate = ((Node) o).valueOf("@validationDate"); + + if (StringUtils.isNotBlank(target)) { + final String 
targetType = element.attributeValue("targetType"); + if (StringUtils.isNotBlank(targetType)) { + final String targetId = createOpenaireId(targetType, target, true); + rels + .add( + OafMapperUtils + .getRelation( + entity.getId(), targetId, relType, subRelType, relClass, entity, + validationdDate)); + rels + .add( + OafMapperUtils + .getRelation( + targetId, entity.getId(), relType, subRelType, relClassInverse, entity, + validationdDate)); + } + } + } + } + return rels; + } + + protected abstract List addOtherResultRels( + final Document doc, + final OafEntity entity); + + private void populateResultFields( + final Result r, + final Document doc, + final List instances, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); + r.setOriginalId(findOriginalId(doc)); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid(IdentifierFactory.getPids(prepareResultPids(doc, info), collectedFrom)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); + r.setSource(prepareSources(doc, info)); + 
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(prepareResourceType(doc, info)); + r.setCoverage(prepareCoverages(doc, info)); + r.setContext(prepareContexts(doc, info)); + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r + .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); + r + .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + + r.setInstance(instances); + r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); + } + + protected abstract List prepareResultPids(Document doc, DataInfo info); + + private List prepareContexts(final Document doc, final DataInfo info) { + final List list = new ArrayList<>(); + for (final Object o : doc.selectNodes("//oaf:concept")) { + final String cid = ((Node) o).valueOf("@id"); + if (StringUtils.isNotBlank(cid)) { + final Context c = new Context(); + c.setId(cid); + c.setDataInfo(Arrays.asList(info)); + list.add(c); + } + } + return list; + } + + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + + protected abstract List prepareInstances( + Document doc, + DataInfo info, + KeyValue collectedfrom, + KeyValue hostedby); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List 
prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools( + Document doc, + DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups( + Document doc, + DataInfo info); + + protected abstract List> prepareOtherResearchProductContactPersons( + Document doc, + DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls( + Document doc, + DataInfo info); + + protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + final String ep = n.valueOf("@ep"); + final String iss = n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); + 
if (StringUtils.isNotBlank(name)) { + return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); + } + } + return null; + } + + private List findOriginalId(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + if (n != null) { + final String id = n.valueOf("./*[local-name()='identifier']"); + if (StringUtils.isNotBlank(id)) { + return Lists.newArrayList(id); + } + } + final List idList = doc + .selectNodes( + "normalize-space(//*[local-name()='header']/*[local-name()='identifier' or local-name()='recordIdentifier']/text())"); + final Set originalIds = Sets.newHashSet(idList); + + if (originalIds.isEmpty()) { + throw new IllegalStateException("missing originalID on " + doc.asXML()); + } + return Lists.newArrayList(originalIds); + } + + protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) { + final Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId); + final AccessRight accessRight = new AccessRight(); + accessRight.setClassid(qualifier.getClassid()); + accessRight.setClassname(qualifier.getClassname()); + accessRight.setSchemeid(qualifier.getSchemeid()); + accessRight.setSchemename(qualifier.getSchemename()); + + // TODO set the OAStatus + + return accessRight; + } + + protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) { + return prepareQualifier(node.valueOf(xpath).trim(), schemeId); + } + + protected Qualifier prepareQualifier(final String classId, final String schemeId) { + return vocs.getTermAsQualifier(schemeId, classId); + } + + protected List prepareListStructPropsWithValidQualifier( + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final DataInfo info) { + final List res = new ArrayList<>(); + + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + 
final String classId = n.valueOf(xpathClassId).trim(); + if (vocs.termExists(schemeId, classId)) { + res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info)); + } + } + return res; + } + + protected List prepareListStructProps( + final Node node, + final String xpath, + final Qualifier qualifier, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, + final String xpath, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res + .add( + structuredProperty( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); + } + return res; + } + + protected List prepareSubjectList( + final Node node, + final String xpath, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res + .add( + subject( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); + } + return res; + } + + protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + + if (n == null) { + return null; + } + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']"); + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']"); + final String harvestDate = 
n.valueOf("@harvestDate"); + + return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + protected DataInfo prepareDataInfo(final Document doc, final boolean invisible) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + if (n == null) { + return dataInfo(false, null, false, invisible, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + } + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo( + deletedbyinference, inferenceprovenance, inferred, invisible, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + } + + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); + } + + protected List> prepareListFields( + final Node node, + final String xpath, + final DataInfo info) { + return listFields(info, prepareListString(node, xpath)); + } + + protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } + + protected Set validateUrl(Collection url) { + UrlValidator urlValidator = UrlValidator.getInstance(); + if (Objects.isNull(url)) { + return new HashSet<>(); + } + return url + .stream() + .filter(u -> urlValidator.isValid(u)) + 
.collect(Collectors.toCollection(HashSet::new)); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index fc063a242c..304ec8f993 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -457,10 +457,12 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected String guessRelatedIdentifier(final String idType, final String value) { if (StringUtils.isBlank(idType) || StringUtils.isBlank(value)) return null; - if (idType.equalsIgnoreCase("OPENAIRE")) { - return createOpenaireId(50, value, false); - } else - return null; + if (idType.equalsIgnoreCase("OPENAIRE")) return createOpenaireId(50, value, false); + if(pidTypeWithAuthority.contains(idType.toLowerCase())){ + return IdentifierFactory.idFromPid("50", idType, value, true); + } + return null; + } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index fbfbf5af8a..32b23e42f8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -933,7 +933,7 @@ class MappersTest { System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); - assertEquals(3, list.size()); + assertEquals(7, list.size()); final OtherResearchProduct p = (OtherResearchProduct) list.get(0); assertValidId(p.getId()); assertValidId(p.getCollectedfrom().get(0).getKey()); @@ -941,6 +941,21 @@ class MappersTest { 
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertEquals("w3id", (p.getPid().get(0).getQualifier().getClassid())); assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", (p.getPid().get(0).getValue())); + + assertEquals(1, list.stream().filter(o -> o instanceof OtherResearchProduct).count()); + assertEquals(6, list.stream().filter(o -> o instanceof Relation).count()); + + for(Oaf oaf : list){ + if(oaf instanceof Relation){ + String source = ((Relation) oaf).getSource(); + String target = ((Relation) oaf).getTarget(); + assertNotEquals(source, target); + assertTrue(source.equals(p.getId()) || target.equals(p.getId())); + assertNotNull(((Relation) oaf).getSubRelType()); + assertNotNull( ((Relation) oaf).getRelClass()); + assertNotNull(((Relation) oaf).getRelType()); + } + } } @Test From fd63e9bfac41a4ec508dd9cb66880db46a1cb1b5 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Mon, 26 Sep 2022 11:24:13 +0200 Subject: [PATCH 4/5] Mapping all relationships supported in ModelConstants and ModelSupport --- .../raw/AbstractMdRecordToOafMapper.java | 14 ++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 78 +++++++------------ .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 26 ++++++- .../dnetlib/dhp/oa/graph/raw/odf_dataset.xml | 1 - .../dnetlib/dhp/oa/graph/raw/odf_software.xml | 2 +- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index bb5472a882..5f185444f2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -69,14 +69,16 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", 
DATACITE_SCHEMA_KERNEL_3); } - protected static final Set pidTypeWithAuthority = new HashSet<>(); + // lowercase pidTypes as keys, normal casing for the values + protected static final Map pidTypeWithAuthority = new HashMap<>(); static { - pidTypeWithAuthority.addAll(IdentifierFactory.PID_AUTHORITY.keySet().stream() - .map(PidType::toString) - .map(String::toLowerCase) - .collect(Collectors.toCollection(HashSet::new))); - } + IdentifierFactory.PID_AUTHORITY + .keySet() + .stream() + .forEach(entry -> pidTypeWithAuthority.put(entry.toString().toLowerCase(), entry.toString())); + + } protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, final boolean forceOriginalId) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 304ec8f993..a25bcd47e8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -22,6 +22,8 @@ import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.common.RelationInverse; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; @@ -400,55 +402,12 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (StringUtils.isNotBlank(originalId)) { final String idType = ((Node) o).valueOf("@relatedIdentifierType"); - final String reltype = ((Node) o).valueOf("@relationType"); + final String relType = ((Node) o).valueOf("@relationType"); String otherId = guessRelatedIdentifier(idType, originalId); if 
(StringUtils.isNotBlank(otherId)) { - if (reltype.equalsIgnoreCase(IS_SUPPLEMENT_TO)) { - res - .add( - getRelation( - docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity)); - res - .add( - getRelation( - otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity)); - } else { - if (reltype.equalsIgnoreCase(IS_SUPPLEMENTED_BY)) { - res - .add( - getRelation( - otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity)); - res - .add( - getRelation( - docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity)); - } else { - if (reltype.equalsIgnoreCase(IS_PART_OF)) { - res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity)); - res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PART, HAS_PART, entity)); - } else { - if (reltype.equalsIgnoreCase(HAS_PART)) { - res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PART, IS_PART_OF, entity)); - res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PART, HAS_PART, entity)); - } - // else TODO catch more semantics - } - } - } - + res.addAll(getRelations(relType, docId, otherId, entity)); } + } } return res; @@ -457,14 +416,33 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected String guessRelatedIdentifier(final String idType, final String value) { if (StringUtils.isBlank(idType) || StringUtils.isBlank(value)) return null; - if (idType.equalsIgnoreCase("OPENAIRE")) return createOpenaireId(50, value, false); - if(pidTypeWithAuthority.contains(idType.toLowerCase())){ - return IdentifierFactory.idFromPid("50", idType, value, true); - } + if (idType.equalsIgnoreCase("OPENAIRE")) + return createOpenaireId(50, value, false); + if (pidTypeWithAuthority.containsKey(idType.toLowerCase())) { + return IdentifierFactory.idFromPid("50", pidTypeWithAuthority.get(idType.toLowerCase()), value, true); + } return null; } + protected List getRelations(final String reltype, final String entityId, final String 
otherId, + final OafEntity entity) { + final List res = new ArrayList<>(); + RelationInverse rel = ModelSupport.findRelation(reltype); + if (rel != null) { + res + .add( + getRelation( + entityId, otherId, rel.getRelType(), rel.getSubReltype(), rel.getRelClass(), entity)); + res + .add( + getRelation( + otherId, entityId, rel.getRelType(), rel.getSubReltype(), rel.getInverseRelClass(), entity)); + + } + return res; + } + @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return prepareQualifier( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 32b23e42f8..7552d1789c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -579,8 +579,10 @@ class MappersTest { final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); - assertEquals(1, list.size()); + assertEquals(3, list.size()); assertTrue(list.get(0) instanceof Software); + assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(2) instanceof Relation); final Software s = (Software) list.get(0); @@ -590,6 +592,22 @@ class MappersTest { assertTrue(s.getAuthor().size() > 0); assertTrue(s.getSubject().size() > 0); assertTrue(s.getInstance().size() > 0); + + final Relation r1 = (Relation) list.get(1); + final Relation r2 = (Relation) list.get(2); + + assertEquals(s.getId(), r1.getSource()); + assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getTarget()); + assertEquals(ModelConstants.RESULT_RESULT, r1.getRelType()); + assertEquals(ModelConstants.RELATIONSHIP, r1.getSubRelType()); + assertEquals(ModelConstants.IS_REFERENCED_BY, r1.getRelClass()); + + assertEquals(s.getId(), r2.getTarget()); + 
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getSource()); + assertEquals(ModelConstants.RESULT_RESULT, r2.getRelType()); + assertEquals(ModelConstants.RELATIONSHIP, r2.getSubRelType()); + assertEquals(ModelConstants.REFERENCES, r2.getRelClass()); + } @Test @@ -945,14 +963,14 @@ class MappersTest { assertEquals(1, list.stream().filter(o -> o instanceof OtherResearchProduct).count()); assertEquals(6, list.stream().filter(o -> o instanceof Relation).count()); - for(Oaf oaf : list){ - if(oaf instanceof Relation){ + for (Oaf oaf : list) { + if (oaf instanceof Relation) { String source = ((Relation) oaf).getSource(); String target = ((Relation) oaf).getTarget(); assertNotEquals(source, target); assertTrue(source.equals(p.getId()) || target.equals(p.getId())); assertNotNull(((Relation) oaf).getSubRelType()); - assertNotNull( ((Relation) oaf).getRelClass()); + assertNotNull(((Relation) oaf).getRelClass()); assertNotNull(((Relation) oaf).getRelType()); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 4f41ee6eac..4633d62c39 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -69,7 +69,6 @@ - 10.5281/zenodo.3234525 https://zenodo.org/communities/epfl 1.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml index 6a9170ce17..387b1ee866 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_software.xml @@ -20,7 +20,7 @@ bio.tools 
http://maplab.imppc.org/chainy/ - 10.1093/bioinformatics/btw839 + 10.1093/bioinformatics/btw839 https://bio.tools/ From 3f90d159e3efc63529f5a51ae03c2410230869b4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 27 Sep 2022 15:08:00 +0200 Subject: [PATCH 5/5] code formatting --- .../raw/AbstractMdRecordToOafMapper.java | 1188 ++++++++--------- pom.xml | 2 +- 2 files changed, 595 insertions(+), 595 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5f185444f2..c157be51a0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -15,7 +15,6 @@ import java.net.URL; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.oaf.utils.PidType; import org.apache.commons.lang3.StringUtils; import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.*; @@ -31,617 +30,618 @@ import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; public abstract class AbstractMdRecordToOafMapper { - protected final VocabularyGroup vocs; + protected final VocabularyGroup vocs; - private final boolean invisible; + private final boolean invisible; - private final boolean shouldHashId; + private final boolean shouldHashId; - private final boolean forceOriginalId; + private final boolean forceOriginalId; - protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; - protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = 
"http://datacite.org/schema/kernel-4/"; - protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; - protected static final Qualifier ORCID_PID_TYPE = qualifier( - ModelConstants.ORCID_PENDING, - ModelConstants.ORCID_CLASSNAME, - DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier( - "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; + protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; + protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; + protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; + protected static final Qualifier ORCID_PID_TYPE = qualifier( + ModelConstants.ORCID_PENDING, + ModelConstants.ORCID_CLASSNAME, + DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier( + "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); - protected static final String DEFAULT_TRUST_FOR_VALIDATED_RELS = "0.999"; + protected static final String DEFAULT_TRUST_FOR_VALIDATED_RELS = "0.999"; - protected static final Map nsContext = new HashMap<>(); + protected static final Map nsContext = new HashMap<>(); - private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class); + private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class); - static { - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", 
"http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); - } + static { + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); + } // lowercase pidTypes as keys, normal casing for the values protected static final Map pidTypeWithAuthority = new HashMap<>(); - static { + static { IdentifierFactory.PID_AUTHORITY - .keySet() - .stream() - .forEach(entry -> pidTypeWithAuthority.put(entry.toString().toLowerCase(), entry.toString())); + .keySet() + .stream() + .forEach(entry -> pidTypeWithAuthority.put(entry.toString().toLowerCase(), entry.toString())); } - protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, - final boolean shouldHashId, final boolean forceOriginalId) { - this.vocs = vocs; - this.invisible = invisible; - this.shouldHashId = shouldHashId; - this.forceOriginalId = forceOriginalId; - } - - protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, - final boolean shouldHashId) { - this.vocs = vocs; - this.invisible = invisible; - this.shouldHashId = shouldHashId; - this.forceOriginalId = false; - } - - public List processMdRecord(final String xml) { - - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - try { - final Document doc = DocumentHelper - .parseText( - xml - .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) - .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) - 
.replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); - - final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - - if (collectedFrom == null) { - return Lists.newArrayList(); - } - - final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - - if (hostedBy == null) { - return Lists.newArrayList(); - } - - final DataInfo info = prepareDataInfo(doc, invisible); - final long lastUpdateTimestamp = new Date().getTime(); - - final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); - - final String type = getResultType(doc, instances); - - return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); - } catch (DocumentException e) { - log.error("Error with record:\n" + xml); - return Lists.newArrayList(); - } - } - - protected String getResultType(final Document doc, final List instances) { - final String type = doc.valueOf("//dr:CobjCategory/@type"); - - if (StringUtils.isBlank(type) && vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { - final String instanceType = instances - .stream() - .map(i -> i.getInstancetype().getClassid()) - .findFirst() - .filter(s -> !UNKNOWN.equalsIgnoreCase(s)) - .orElse("0000"); // Unknown - return Optional - .ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) - .map(Qualifier::getClassid) - .orElse("0000"); - } - - return type; - } - - private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { - final String dsId = doc.valueOf(xpathId); - final String dsName = doc.valueOf(xpathName); - - if (StringUtils.isBlank(dsId) || StringUtils.isBlank(dsName)) { - return null; - } - - return keyValue(createOpenaireId(10, dsId, true), dsName); - } - - protected List createOafs( - final Document doc, - 
final String type, - final List instances, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - - final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); - - final Set originalId = Sets.newHashSet(entity.getOriginalId()); - originalId.add(entity.getId()); - entity.setOriginalId(Lists.newArrayList(originalId)); - - if (!forceOriginalId) { - final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); - if (!id.equals(entity.getId())) { - entity.setId(id); - } - } - - final List oafs = Lists.newArrayList(entity); - - if (!oafs.isEmpty()) { - Set rels = Sets.newHashSet(); - - rels.addAll(addProjectRels(doc, entity)); - rels.addAll(addOtherResultRels(doc, entity)); - rels.addAll(addRelations(doc, entity)); - - oafs.addAll(rels); - } - - return oafs; - } - - private OafEntity createEntity(final Document doc, - final String type, - final List instances, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); - p.setJournal(prepareJournal(doc, info)); - return p; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - return d; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); - 
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - return s; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - return o; - } - } - - private List addProjectRels( - final Document doc, - final OafEntity entity) { - - final List res = new ArrayList<>(); - - final String docId = entity.getId(); - - for (final Object o : doc.selectNodes("//oaf:projectid")) { - - final String originalId = ((Node) o).getText(); - - final String validationdDate = ((Node) o).valueOf("@validationDate"); - - if (StringUtils.isNotBlank(originalId)) { - final String projectId = createOpenaireId(40, originalId, true); - - res - .add( - OafMapperUtils - .getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); - res - .add( - OafMapperUtils - .getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); - } - } - - return res; - } - - private List addRelations(Document doc, OafEntity entity) { - - final List rels = Lists.newArrayList(); - - for (Object o : doc.selectNodes("//oaf:relation")) { - Element element = (Element) o; - - final String target = StringUtils.trim(element.getText()); - final String relType = element.attributeValue("relType"); - final String subRelType = element.attributeValue("subRelType"); - final String relClass = element.attributeValue("relClass"); - - if (StringUtils.isNotBlank(target) && StringUtils.isNotBlank(relType) && 
StringUtils.isNotBlank(subRelType) - && StringUtils.isNotBlank(relClass)) { - - final String relClassInverse = ModelSupport - .findInverse(ModelSupport.rel(relType, subRelType, relClass)) - .getInverseRelClass(); - final String validationdDate = ((Node) o).valueOf("@validationDate"); - - if (StringUtils.isNotBlank(target)) { - final String targetType = element.attributeValue("targetType"); - if (StringUtils.isNotBlank(targetType)) { - final String targetId = createOpenaireId(targetType, target, true); - rels - .add( - OafMapperUtils - .getRelation( - entity.getId(), targetId, relType, subRelType, relClass, entity, - validationdDate)); - rels - .add( - OafMapperUtils - .getRelation( - targetId, entity.getId(), relType, subRelType, relClassInverse, entity, - validationdDate)); - } - } - } - } - return rels; - } - - protected abstract List addOtherResultRels( - final Document doc, - final OafEntity entity); - - private void populateResultFields( - final Result r, - final Document doc, - final List instances, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(findOriginalId(doc)); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid(IdentifierFactory.getPids(prepareResultPids(doc, info), collectedFrom)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(prepareAuthors(doc, info)); - r.setLanguage(prepareLanguages(doc)); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareSubjects(doc, info)); - r.setTitle(prepareTitles(doc, info)); - 
r.setRelevantdate(prepareRelevantDates(doc, info)); - r.setDescription(prepareDescriptions(doc, info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); - r.setSource(prepareSources(doc, info)); - r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); - r.setFormat(prepareFormats(doc, info)); - r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(prepareResourceType(doc, info)); - r.setCoverage(prepareCoverages(doc, info)); - r.setContext(prepareContexts(doc, info)); - r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r - .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); - r - .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - - r.setInstance(instances); - r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); - } - - protected abstract List prepareResultPids(Document doc, DataInfo info); - - private List prepareContexts(final Document doc, final DataInfo info) { - final List list = new ArrayList<>(); - for (final Object o : doc.selectNodes("//oaf:concept")) { - final String cid = ((Node) o).valueOf("@id"); - if (StringUtils.isNotBlank(cid)) { - final Context c = new Context(); - c.setId(cid); - c.setDataInfo(Arrays.asList(info)); - list.add(c); - } - } - return list; - } - - protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); - - protected abstract List prepareInstances( - Document doc, - DataInfo info, - KeyValue collectedfrom, - KeyValue hostedby); - - protected abstract List> prepareSources(Document doc, DataInfo info); - - protected abstract List prepareRelevantDates(Document doc, DataInfo info); - - protected abstract List> prepareCoverages(Document doc, DataInfo info); - - protected abstract List> prepareContributors(Document doc, DataInfo info); 
- - protected abstract List> prepareFormats(Document doc, DataInfo info); - - protected abstract Field preparePublisher(Document doc, DataInfo info); - - protected abstract List> prepareDescriptions(Document doc, DataInfo info); - - protected abstract List prepareTitles(Document doc, DataInfo info); - - protected abstract List prepareSubjects(Document doc, DataInfo info); - - protected abstract Qualifier prepareLanguages(Document doc); - - protected abstract List prepareAuthors(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductTools( - Document doc, - DataInfo info); - - protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, - DataInfo info); - - protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, - DataInfo info); - - protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); - - protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); - - protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); - - protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, - DataInfo info); - - protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); - - protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); - - protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); - - protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); - - protected abstract Field prepareDatasetSize(Document doc, DataInfo info); - - protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); - - protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - - private Journal prepareJournal(final Document doc, final DataInfo info) { - final Node n = doc.selectSingleNode("//oaf:journal"); - if (n != null) { - final String name = n.getText(); - final String 
issnPrinted = n.valueOf("@issn"); - final String issnOnline = n.valueOf("@eissn"); - final String issnLinking = n.valueOf("@lissn"); - final String ep = n.valueOf("@ep"); - final String iss = n.valueOf("@iss"); - final String sp = n.valueOf("@sp"); - final String vol = n.valueOf("@vol"); - final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); - } - } - return null; - } - - private List findOriginalId(final Document doc) { - final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n != null) { - final String id = n.valueOf("./*[local-name()='identifier']"); - if (StringUtils.isNotBlank(id)) { - return Lists.newArrayList(id); - } - } - final List idList = doc - .selectNodes( - "normalize-space(//*[local-name()='header']/*[local-name()='identifier' or local-name()='recordIdentifier']/text())"); - final Set originalIds = Sets.newHashSet(idList); - - if (originalIds.isEmpty()) { - throw new IllegalStateException("missing originalID on " + doc.asXML()); - } - return Lists.newArrayList(originalIds); - } - - protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) { - final Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId); - final AccessRight accessRight = new AccessRight(); - accessRight.setClassid(qualifier.getClassid()); - accessRight.setClassname(qualifier.getClassname()); - accessRight.setSchemeid(qualifier.getSchemeid()); - accessRight.setSchemename(qualifier.getSchemename()); - - // TODO set the OAStatus - - return accessRight; - } - - protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) { - return prepareQualifier(node.valueOf(xpath).trim(), schemeId); - } - - protected Qualifier prepareQualifier(final String classId, final String schemeId) { - return 
vocs.getTermAsQualifier(schemeId, classId); - } - - protected List prepareListStructPropsWithValidQualifier( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final DataInfo info) { - final List res = new ArrayList<>(); - - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId).trim(); - if (vocs.termExists(schemeId, classId)) { - res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info)); - } - } - return res; - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final Qualifier qualifier, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), qualifier, info)); - } - return res; - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res - .add( - structuredProperty( - n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), - n.valueOf("@schemename"), info)); - } - return res; - } - - protected List prepareSubjectList( - final Node node, - final String xpath, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res - .add( - subject( - n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), - n.valueOf("@schemename"), info)); - } - return res; - } - - protected OAIProvenance prepareOAIprovenance(final Document doc) { - final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - - if (n == null) { - return null; - } - - final String identifier = n.valueOf("./*[local-name()='identifier']"); - 
final String baseURL = n.valueOf("./*[local-name()='baseURL']"); - final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); - final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); - final String datestamp = n.valueOf("./*[local-name()='datestamp']"); - final String harvestDate = n.valueOf("@harvestDate"); - - return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); - } - - protected DataInfo prepareDataInfo(final Document doc, final boolean invisible) { - final Node n = doc.selectSingleNode("//oaf:datainfo"); - - if (n == null) { - return dataInfo(false, null, false, invisible, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); - } - - final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); - final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); - final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); - final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); - - final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); - final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); - final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); - final String trust = n.valueOf("./oaf:trust"); - - return dataInfo( - deletedbyinference, inferenceprovenance, inferred, invisible, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); - } - - protected Field prepareField(final Node node, final String xpath, final DataInfo info) { - return field(node.valueOf(xpath), info); - } - - protected List> prepareListFields( - final Node node, - final String xpath, - final DataInfo info) { - return listFields(info, prepareListString(node, xpath)); - } - - protected List prepareListString(final Node node, final String xpath) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final String s = ((Node) 
o).getText().trim(); - if (StringUtils.isNotBlank(s)) { - res.add(s); - } - } - return res; - } - - protected Set validateUrl(Collection url) { - UrlValidator urlValidator = UrlValidator.getInstance(); - if (Objects.isNull(url)) { - return new HashSet<>(); - } - return url - .stream() - .filter(u -> urlValidator.isValid(u)) - .collect(Collectors.toCollection(HashSet::new)); - } + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, + final boolean shouldHashId, final boolean forceOriginalId) { + this.vocs = vocs; + this.invisible = invisible; + this.shouldHashId = shouldHashId; + this.forceOriginalId = forceOriginalId; + } + + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible, + final boolean shouldHashId) { + this.vocs = vocs; + this.invisible = invisible; + this.shouldHashId = shouldHashId; + this.forceOriginalId = false; + } + + public List processMdRecord(final String xml) { + + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + try { + final Document doc = DocumentHelper + .parseText( + xml + .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); + + final KeyValue collectedFrom = getProvenanceDatasource( + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + + if (collectedFrom == null) { + return Lists.newArrayList(); + } + + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) + ? 
collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + + if (hostedBy == null) { + return Lists.newArrayList(); + } + + final DataInfo info = prepareDataInfo(doc, invisible); + final long lastUpdateTimestamp = new Date().getTime(); + + final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); + + final String type = getResultType(doc, instances); + + return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + } catch (DocumentException e) { + log.error("Error with record:\n" + xml); + return Lists.newArrayList(); + } + } + + protected String getResultType(final Document doc, final List instances) { + final String type = doc.valueOf("//dr:CobjCategory/@type"); + + if (StringUtils.isBlank(type) && vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { + final String instanceType = instances + .stream() + .map(i -> i.getInstancetype().getClassid()) + .findFirst() + .filter(s -> !UNKNOWN.equalsIgnoreCase(s)) + .orElse("0000"); // Unknown + return Optional + .ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) + .map(Qualifier::getClassid) + .orElse("0000"); + } + + return type; + } + + private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { + final String dsId = doc.valueOf(xpathId); + final String dsName = doc.valueOf(xpathName); + + if (StringUtils.isBlank(dsId) || StringUtils.isBlank(dsName)) { + return null; + } + + return keyValue(createOpenaireId(10, dsId, true), dsName); + } + + protected List createOafs( + final Document doc, + final String type, + final List instances, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + + final Set originalId = Sets.newHashSet(entity.getOriginalId()); + originalId.add(entity.getId()); + 
entity.setOriginalId(Lists.newArrayList(originalId)); + + if (!forceOriginalId) { + final String id = IdentifierFactory.createIdentifier(entity, shouldHashId); + if (!id.equals(entity.getId())) { + entity.setId(id); + } + } + + final List oafs = Lists.newArrayList(entity); + + if (!oafs.isEmpty()) { + Set rels = Sets.newHashSet(); + + rels.addAll(addProjectRels(doc, entity)); + rels.addAll(addOtherResultRels(doc, entity)); + rels.addAll(addRelations(doc, entity)); + + oafs.addAll(rels); + } + + return oafs; + } + + private OafEntity createEntity(final Document doc, + final String type, + final List instances, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + switch (type.toLowerCase()) { + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); + p.setJournal(prepareJournal(doc, info)); + return p; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + return d; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + return s; + case "": + case "otherresearchproducts": + default: + final 
OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + return o; + } + } + + /** Builds, for every oaf:projectid element with non-blank text, a pair of RESULT_PROJECT / OUTCOME relations between the entity and the project: IS_PRODUCED_BY (entity to project) and PRODUCES (project to entity). The project id is createOpenaireId(40, text, true); the validationDate attribute of the element is passed to both relations. */ private List addProjectRels( + final Document doc, + final OafEntity entity) { + + final List res = new ArrayList<>(); + + final String docId = entity.getId(); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + + final String originalId = ((Node) o).getText(); + + final String validationdDate = ((Node) o).valueOf("@validationDate"); + + if (StringUtils.isNotBlank(originalId)) { + final String projectId = createOpenaireId(40, originalId, true); + + res + .add( + OafMapperUtils + .getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); + res + .add( + OafMapperUtils + .getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); + } + } + + return res; + } + + /** Builds relation pairs from oaf:relation elements. An element is used only when its trimmed text (the target), its relType, subRelType and relClass attributes and its targetType attribute are all non-blank. For each such element two relations are added: entity to target with relClass, and target to entity with the inverse relClass obtained through ModelSupport.findInverse. */ private List addRelations(Document doc, OafEntity entity) { + + final List rels = Lists.newArrayList(); + + for (Object o : doc.selectNodes("//oaf:relation")) { + Element element = (Element) o; + + final String target = StringUtils.trim(element.getText()); + final String relType = element.attributeValue("relType"); + final String subRelType = element.attributeValue("subRelType"); + final String relClass = element.attributeValue("relClass"); + + if (StringUtils.isNotBlank(target) && StringUtils.isNotBlank(relType) && StringUtils.isNotBlank(subRelType) + && StringUtils.isNotBlank(relClass)) { + + /* NOTE(review): the value returned by findInverse is dereferenced without a null check - confirm it always resolves for the relType/subRelType/relClass triples found in input records. */ final String relClassInverse = ModelSupport + .findInverse(ModelSupport.rel(relType, subRelType, relClass)) + .getInverseRelClass(); + final String validationdDate = ((Node) o).valueOf("@validationDate"); + + /* NOTE(review): target was already tested by the enclosing condition, so this check looks redundant. */ if (StringUtils.isNotBlank(target)) { + final String
targetType = element.attributeValue("targetType"); + if (StringUtils.isNotBlank(targetType)) { + final String targetId = createOpenaireId(targetType, target, true); + rels + .add( + OafMapperUtils + .getRelation( + entity.getId(), targetId, relType, subRelType, relClass, entity, + validationdDate)); + rels + .add( + OafMapperUtils + .getRelation( + targetId, entity.getId(), relType, subRelType, relClassInverse, entity, + validationdDate)); + } + } + } + } + return rels; + } + + /** Subclass hook receiving the parsed record and the entity. Presumably returns additional result-to-result relations - no caller is visible in this chunk, confirm against the implementations. */ protected abstract List addOtherResultRels( + final Document doc, + final OafEntity entity); + + /** Fills the fields shared by every Result subtype. The id is createOpenaireId(50, dri:objIdentifier, false); pids go through IdentifierFactory.getPids; extraInfo, country and externalReference are set to empty lists (flagged in the body as not present in mdstores); bestaccessright is computed from the supplied instances. The remaining fields come from the prepare* helpers or from direct XPath lookups on doc. */ private void populateResultFields( + final Result r, + final Document doc, + final List instances, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); + r.setOriginalId(findOriginalId(doc)); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid(IdentifierFactory.getPids(prepareResultPids(doc, info), collectedFrom)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); + r.setSource(prepareSources(doc, info)); +
r.setFulltext(prepareListFields(doc, "//oaf:fulltext", info)); + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(prepareResourceType(doc, info)); + r.setCoverage(prepareCoverages(doc, info)); + r.setContext(prepareContexts(doc, info)); + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r + .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); + r + .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + + r.setInstance(instances); + r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); + } + + /** Subclass hook returning the pids found in the record; populateResultFields hands the result to IdentifierFactory.getPids together with collectedFrom. */ protected abstract List prepareResultPids(Document doc, DataInfo info); + + /** Creates one Context for each oaf:concept element whose id attribute is non-blank, using that id and a single-element dataInfo list holding info. */ private List prepareContexts(final Document doc, final DataInfo info) { + final List list = new ArrayList<>(); + for (final Object o : doc.selectNodes("//oaf:concept")) { + final String cid = ((Node) o).valueOf("@id"); + if (StringUtils.isNotBlank(cid)) { + final Context c = new Context(); + c.setId(cid); + c.setDataInfo(Arrays.asList(info)); + list.add(c); + } + } + return list; + } + + /* The abstract prepare* methods that follow are extraction hooks left to concrete mappers. Those called from populateResultFields supply authors, language, subjects, titles, relevant dates, descriptions, publisher, sources, formats, contributors, resource type and coverages. Callers of the remaining hooks (instances, software and dataset fields) are not visible in this chunk - presumably the per-type entity builders; confirm there. */ protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + + protected abstract List prepareInstances( + Document doc, + DataInfo info, + KeyValue collectedfrom, + KeyValue hostedby); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List
prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools( + Document doc, + DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups( + Document doc, + DataInfo info); + + protected abstract List> prepareOtherResearchProductContactPersons( + Document doc, + DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls( + Document doc, + DataInfo info); + + protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + /** Reads the oaf:journal element returned by selectSingleNode: its text is the journal name and the issn, eissn, lissn, ep, iss, sp, vol and edition attributes fill the matching journal(...) arguments (two further arguments are passed as null). Returns null when the element is missing or its text is blank. */ private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + final String ep = n.valueOf("@ep"); + final String iss = n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); +
if (StringUtils.isNotBlank(name)) { + return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); + } + } + return null; + } + + /** Resolves the original identifier(s) of the record. The identifier child of provenance/originDescription wins when non-blank; otherwise the text of the header identifier or recordIdentifier is used, de-duplicated through a set. Throws IllegalStateException when the fallback lookup yields an empty collection. NOTE(review): the fallback XPath is wrapped in normalize-space(), which is string-valued - confirm what selectNodes returns for it and whether a blank result can slip past the isEmpty() check. */ private List findOriginalId(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + if (n != null) { + final String id = n.valueOf("./*[local-name()='identifier']"); + if (StringUtils.isNotBlank(id)) { + return Lists.newArrayList(id); + } + } + final List idList = doc + .selectNodes( + "normalize-space(//*[local-name()='header']/*[local-name()='identifier' or local-name()='recordIdentifier']/text())"); + final Set originalIds = Sets.newHashSet(idList); + + if (originalIds.isEmpty()) { + throw new IllegalStateException("missing originalID on " + doc.asXML()); + } + return Lists.newArrayList(originalIds); + } + + /** Resolves the trimmed value at xpath against vocabulary schemeId and copies classid, classname, schemeid and schemename of the resulting Qualifier into a new AccessRight. The OA status is not set (see the TODO in the body). */ protected AccessRight prepareAccessRight(final Node node, final String xpath, final String schemeId) { + final Qualifier qualifier = prepareQualifier(node.valueOf(xpath).trim(), schemeId); + final AccessRight accessRight = new AccessRight(); + accessRight.setClassid(qualifier.getClassid()); + accessRight.setClassname(qualifier.getClassname()); + accessRight.setSchemeid(qualifier.getSchemeid()); + accessRight.setSchemename(qualifier.getSchemename()); + + // TODO set the OAStatus + + return accessRight; + } + + /** Qualifier for the trimmed value found at xpath under node, looked up in vocabulary schemeId. */ protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) { + return prepareQualifier(node.valueOf(xpath).trim(), schemeId); + } + + /** Delegates to vocs.getTermAsQualifier(schemeId, classId). */ protected Qualifier prepareQualifier(final String classId, final String schemeId) { + return vocs.getTermAsQualifier(schemeId, classId); + } + + /** For each node matched by xpath, reads the trimmed class id at xpathClassId and, only if vocs.termExists(schemeId, classId), adds a structured property made of the node text and the vocabulary qualifier. Nodes whose class id is not in the vocabulary are skipped. */ protected List prepareListStructPropsWithValidQualifier( + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final DataInfo info) { + final List res = new ArrayList<>(); + + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; +
final String classId = n.valueOf(xpathClassId).trim(); + if (vocs.termExists(schemeId, classId)) { + res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info)); + } + } + return res; + } + + /** One structured property per node matched by xpath: the node text combined with the same given qualifier. */ protected List prepareListStructProps( + final Node node, + final String xpath, + final Qualifier qualifier, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + /** One structured property per node matched by xpath; the qualifier parts are read from the classid, classname, schemeid and schemename attributes of each node. */ protected List prepareListStructProps( + final Node node, + final String xpath, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res + .add( + structuredProperty( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); + } + return res; + } + + /** One subject per node matched by xpath, built from the node text and its classid, classname, schemeid and schemename attributes. */ protected List prepareSubjectList( + final Node node, + final String xpath, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res + .add( + subject( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); + } + return res; + } + + /** Maps provenance/originDescription to an OAIProvenance built from the identifier, baseURL, metadataNamespace and datestamp children plus the altered and harvestDate attributes. altered is true only when the attribute equals true ignoring case. Returns null when the element is absent. */ protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + + if (n == null) { + return null; + } + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']"); + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']"); + final String harvestDate =
n.valueOf("@harvestDate"); + + return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + /** Builds the DataInfo from oaf:datainfo. When the element is missing the defaults are: not deleted by inference, null inference provenance, not inferred, REPOSITORY_PROVENANCE_ACTIONS as provenance action and trust 0.9. Otherwise the values are read from the child elements, with the two flags parsed by Boolean.parseBoolean. The invisible flag always comes from the argument, never from the XML. */ protected DataInfo prepareDataInfo(final Document doc, final boolean invisible) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + if (n == null) { + return dataInfo(false, null, false, invisible, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + } + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo( + deletedbyinference, inferenceprovenance, inferred, invisible, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + } + + /** Wraps node.valueOf(xpath) in a Field carrying info. */ protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); + } + + /** Passes the strings returned by prepareListString(node, xpath) to listFields together with info. */ protected List> prepareListFields( + final Node node, + final String xpath, + final DataInfo info) { + return listFields(info, prepareListString(node, xpath)); + } + + /** Trimmed text of every node matched by xpath, in match order, skipping blank values. */ protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } + + /** Returns the elements of url accepted by UrlValidator.getInstance(), collected into a new HashSet; a null collection gives an empty set. */ protected Set validateUrl(Collection url) { + UrlValidator urlValidator = UrlValidator.getInstance(); + if (Objects.isNull(url)) { + return new HashSet<>(); + } + return url + .stream() + .filter(u -> urlValidator.isValid(u)) +
.collect(Collectors.toCollection(HashSet::new)); + } } diff --git a/pom.xml b/pom.xml index 55265bf555..9b60b90789 100644 --- a/pom.xml +++ b/pom.xml @@ -807,7 +807,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [3.14.0-SNAPSHOT] + [3.15.0] [4.0.3] [6.0.5] [3.1.6]