forked from antonis.lempesis/dnet-hadoop
...
This commit is contained in:
parent
bb1533a07e
commit
181e8498d4
|
@ -60,12 +60,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
final Map<String, String> nsContext = new HashMap<>();
|
||||
|
||||
registerNamespaces(nsContext);
|
||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
|
||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||
}
|
||||
|
||||
|
@ -107,7 +102,13 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
}
|
||||
}
|
||||
|
||||
protected abstract void registerNamespaces(Map<String, String> nsContext);
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
}
|
||||
|
||||
protected List<Oaf> createOafs(final Document doc,
|
||||
final String type,
|
||||
|
@ -196,7 +197,7 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
r.setDescription(prepareDescriptions(doc, info));
|
||||
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
|
||||
r.setPublisher(preparePublisher(doc, info));
|
||||
r.setEmbargoenddate(prepareEmbargoEndDate(doc, info));
|
||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||
r.setSource(prepareSources(doc, info));
|
||||
r.setFulltext(null); // NOT PRESENT IN MDSTORES
|
||||
r.setFormat(prepareFormats(doc, info));
|
||||
|
@ -215,8 +216,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
|
||||
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareEmbargoEndDate(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
|
||||
|
@ -289,20 +288,20 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
return null;
|
||||
}
|
||||
|
||||
protected Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) {
|
||||
final String classId = doc.valueOf(xpath);
|
||||
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
|
||||
final String classId = node.valueOf(xpath);
|
||||
final String className = code2name.get(classId);
|
||||
return qualifier(classId, className, schemeId, schemeName);
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Document doc,
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node,
|
||||
final String xpath,
|
||||
final String xpathClassId,
|
||||
final String schemeId,
|
||||
final String schemeName,
|
||||
final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes(xpath)) {
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
final String classId = n.valueOf(xpathClassId);
|
||||
final String className = code2name.get(classId);
|
||||
|
@ -311,18 +310,18 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
return res;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes(xpath)) {
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res.add(structuredProperty(n.getText(), qualifier, info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final DataInfo info) {
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes(xpath)) {
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
|
||||
.valueOf("@schemename"), info));
|
||||
|
@ -359,17 +358,17 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
|||
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||
}
|
||||
|
||||
protected Field<String> prepareField(final Document doc, final String xpath, final DataInfo info) {
|
||||
return field(doc.valueOf(xpath), info);
|
||||
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
||||
return field(node.valueOf(xpath), info);
|
||||
}
|
||||
|
||||
protected List<Field<String>> prepareListFields(final Document doc, final String xpath, final DataInfo info) {
|
||||
return listFields(info, (String[]) prepareListString(doc, xpath).toArray());
|
||||
protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
|
||||
return listFields(info, (String[]) prepareListString(node, xpath).toArray());
|
||||
}
|
||||
|
||||
protected List<String> prepareListString(final Document doc, final String xpath) {
|
||||
protected List<String> prepareListString(final Node node, final String xpath) {
|
||||
final List<String> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes(xpath)) {
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final String s = ((Node) o).getText().trim();
|
||||
if (StringUtils.isNotBlank(s)) {
|
||||
res.add(s);
|
||||
|
|
|
@ -31,8 +31,11 @@ public class MigrateMongoMdstoresApplication {
|
|||
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
} else if (mdFormat.equalsIgnoreCase("oaf")) {
|
||||
|
||||
} else if (mdFormat.equalsIgnoreCase("odf")) {
|
||||
try (final OdfMigrationExecutor mig =
|
||||
new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Format not supported: " + mdFormat);
|
||||
}
|
||||
|
|
|
@ -33,12 +33,8 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
super.registerNamespaces(nsContext);
|
||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -144,12 +140,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
|||
return prepareListFields(doc, "//dc:source", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareEmbargoEndDate(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
|
@ -29,134 +32,160 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareEmbargoEndDate(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
nsContext.put("dc", "http://datacite.org/schema/kernel-3");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
final List<Author> res = new ArrayList<>();
|
||||
int pos = 1;
|
||||
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||
final Node n = (Node) o;
|
||||
final Author author = new Author();
|
||||
author.setFullname(n.valueOf("./dc:creatorName"));
|
||||
author.setName(n.valueOf("./dc:givenName"));
|
||||
author.setSurname(n.valueOf("./dc:familyName"));
|
||||
author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info));
|
||||
author.setPid(preparePids(doc, info));
|
||||
author.setRank(pos++);
|
||||
res.add(author);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("./dc:nameIdentifier")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(((Node) o).getText().trim());
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:date")) {
|
||||
final String dateType = ((Node) o).valueOf("@dateType");
|
||||
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
|
||||
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:format", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:publisher", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
|
||||
}
|
||||
|
||||
// DATASETS
|
||||
|
||||
@Override
|
||||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
final List<GeoLocation> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//dc:geoLocation")) {
|
||||
final GeoLocation loc = new GeoLocation();
|
||||
loc.setBox(((Node) o).valueOf("./dc:geoLocationBox"));
|
||||
loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace"));
|
||||
loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint"));
|
||||
res.add(loc);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -167,32 +196,27 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
|||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareField(doc, "//dc:date[@dateType='Updated']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareField(doc, "//dc:version", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareField(doc, "//dc:size", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
return prepareField(doc, "//dc:date[@dateType='Issued']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue