From 181e8498d4bfa1f848048715c2231bc3b6613907 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Fri, 7 Feb 2020 16:02:49 +0100 Subject: [PATCH] ... --- .../dhp/migration/AbstractMongoExecutor.java | 47 ++-- .../MigrateMongoMdstoresApplication.java | 7 +- .../dhp/migration/OafMigrationExecutor.java | 12 +- .../dhp/migration/OdfMigrationExecutor.java | 220 ++++++++++-------- 4 files changed, 151 insertions(+), 135 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index cf1581b4d..1fa70dded 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -60,12 +60,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { final Map nsContext = new HashMap<>(); registerNamespaces(nsContext); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); } @@ -107,7 +102,13 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } } - protected abstract void registerNamespaces(Map nsContext); + protected void registerNamespaces(final Map nsContext) { + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + } protected List createOafs(final Document doc, final String type, @@ -196,7 +197,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { r.setDescription(prepareDescriptions(doc, info)); r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareEmbargoEndDate(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setSource(prepareSources(doc, info)); r.setFulltext(null); // NOT PRESENT IN MDSTORES r.setFormat(prepareFormats(doc, info)); @@ -215,8 +216,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected abstract List> prepareSources(Document doc, DataInfo info); - protected abstract Field prepareEmbargoEndDate(Document doc, DataInfo info); - protected abstract List prepareRelevantDates(Document doc, DataInfo info); protected abstract List> prepareCoverages(Document doc, DataInfo info); @@ -289,20 +288,20 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return null; } - protected Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) { - final String classId = doc.valueOf(xpath); + protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) { + final String classId = node.valueOf(xpath); final String className = code2name.get(classId); return qualifier(classId, className, schemeId, schemeName); } - protected List prepareListStructProps(final Document doc, + protected List prepareListStructProps(final Node node, final String xpath, final String xpathClassId, final String schemeId, final String schemeName, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; final String classId = n.valueOf(xpathClassId); final String className = code2name.get(classId); @@ -311,18 +310,18 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return res; } - protected List prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) { + protected List prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res.add(structuredProperty(n.getText(), qualifier, info)); } return res; } - protected List prepareListStructProps(final Document doc, final String xpath, final DataInfo info) { + protected List prepareListStructProps(final Node node, final String xpath, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n .valueOf("@schemename"), info)); @@ -359,17 +358,17 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } - protected Field prepareField(final Document doc, final String xpath, final DataInfo info) { - return field(doc.valueOf(xpath), info); + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); } - protected List> prepareListFields(final Document doc, final String xpath, final DataInfo info) { - return listFields(info, (String[]) prepareListString(doc, xpath).toArray()); + protected List> prepareListFields(final Node node, final String xpath, final DataInfo info) { + return listFields(info, (String[]) prepareListString(node, xpath).toArray()); } - protected List prepareListString(final Document doc, final String xpath) { + protected List prepareListString(final Node node, final String xpath) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes(xpath)) { + for (final Object o : node.selectNodes(xpath)) { final String s = ((Node) o).getText().trim(); if (StringUtils.isNotBlank(s)) { res.add(s); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java index 124a4f3cc..359fe7596 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java @@ -31,8 +31,11 @@ public class MigrateMongoMdstoresApplication { new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); } - } else if (mdFormat.equalsIgnoreCase("oaf")) { - + } else if (mdFormat.equalsIgnoreCase("odf")) { + try (final OdfMigrationExecutor mig = + new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { + mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); + } } else { throw new RuntimeException("Format not supported: " + mdFormat); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java index f46b31732..6dcfae71f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java @@ -33,12 +33,8 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { @Override protected void registerNamespaces(final Map nsContext) { + super.registerNamespaces(nsContext); nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); } @Override @@ -144,12 +140,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { return prepareListFields(doc, "//dc:source", info); } - @Override - protected Field prepareEmbargoEndDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - @Override protected List prepareRelevantDates(final Document doc, final DataInfo info) { // TODO Auto-generated method stub diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java index bb0932883..5e9c70ae5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java @@ -1,11 +1,14 @@ package eu.dnetlib.dhp.migration; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Document; +import org.dom4j.Node; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -29,134 +32,160 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected void registerNamespaces(final Map nsContext) { - // TODO Auto-generated method stub - - } - - @Override - protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected Field prepareEmbargoEndDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + nsContext.put("dc", "http://datacite.org/schema/kernel-3"); } @Override protected List prepareTitles(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; - } - - @Override - protected Qualifier prepareLanguages(final Document doc) { - // TODO Auto-generated method stub - return null; + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); } @Override protected List prepareAuthors(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.valueOf("./dc:creatorName")); + author.setName(n.valueOf("./dc:givenName")); + author.setSurname(n.valueOf("./dc:familyName")); + author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info)); + author.setPid(preparePids(doc, info)); + author.setRank(pos++); + res.add(author); + } + return res; + } + + private List preparePids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("./dc:nameIdentifier")) { + res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info)); + } + return res; + } + + @Override + protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) { + final Instance instance = new Instance(); + instance.setUrl(((Node) o).getText().trim()); + instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + res.add(instance); + } + return res; + } + + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } + + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:date")) { + final String dateType = ((Node) o).valueOf("@dateType"); + if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") + && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { + res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info)); + } + } + return res; + } + + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } + + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributorName", info); + } + + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } + + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } + + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info); + } + + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } + + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); } @Override protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info); } @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages"); } @Override protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // Not present in ODF ??? } @Override protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } + // DATASETS + @Override protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//dc:geoLocation")) { + final GeoLocation loc = new GeoLocation(); + loc.setBox(((Node) o).valueOf("./dc:geoLocationBox")); + loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint")); + res.add(loc); + } + return res; } @Override @@ -167,32 +196,27 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:date[@dateType='Updated']", info); } @Override protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:version", info); } @Override protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:size", info); } @Override protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // Not present in ODF ??? } @Override protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return prepareField(doc, "//dc:date[@dateType='Issued']", info); } @Override