partial implementation of migration

This commit is contained in:
Michele Artini 2020-02-04 15:25:47 +01:00
parent 6bfe2dc96e
commit fbb0fc140b
6 changed files with 626 additions and 488 deletions

View File

@ -30,7 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class AbstractMigrateApplication implements Closeable { public class AbstractMigrationExecutor implements Closeable {
private final AtomicInteger counter = new AtomicInteger(0); private final AtomicInteger counter = new AtomicInteger(0);
@ -42,7 +42,7 @@ public class AbstractMigrateApplication implements Closeable {
private final SequenceFile.Writer writer; private final SequenceFile.Writer writer;
public AbstractMigrateApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class)); .keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class));
} }

View File

@ -0,0 +1,369 @@
package eu.dnetlib.dhp.migration;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
/**
 * Base class for executors that read XML records from the mdstores and convert them into OAF objects.
 *
 * <p>
 * This class owns the record loop ({@link #processMdRecords}), the dispatch on the record type ({@link #createOafs}) and a set of generic
 * XPath helpers; the format-specific extraction is delegated to the abstract {@code prepare*} hooks implemented by subclasses.
 *
 * <p>
 * NOTE(review): the constructor installs the XPath namespace map on the shared {@code DocumentFactory} singleton, so the mapping is
 * process-wide and not per-instance.
 */
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {

    /** Vocabulary lookup (class code -> class name), filled from the database by the constructor. */
    protected final Map<String, String> code2name = new HashMap<>();

    /** Client used to list the valid mdstore collections and their records; released by {@link #close()}. */
    protected final MdstoreClient mdstoreClient;

    protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");

    protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
            qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
    protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
    protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
    protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");

    /**
     * Opens the mdstore client, loads the vocabulary terms and registers the XPath namespaces.
     *
     * @param hdfsPath     passed to the parent constructor
     * @param hdfsNameNode passed to the parent constructor
     * @param hdfsUser     passed to the parent constructor
     * @param mongoBaseUrl passed to {@code MdstoreClient}
     * @param mongoDb      passed to {@code MdstoreClient}
     * @param dbUrl        passed to {@code DbClient} to read the class names
     * @param dbUser       passed to {@code DbClient}
     * @param dbPassword   passed to {@code DbClient}
     * @throws Exception if the parent constructor, the mdstore client or the class-name loading fails
     */
    public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
            final String mongoDb, final String dbUrl, final String dbUser,
            final String dbPassword) throws Exception {
        super(hdfsPath, hdfsNameNode, hdfsUser);

        this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
        loadClassNames(dbUrl, dbUser, dbPassword);

        final Map<String, String> nsContext = new HashMap<>();

        // NOTE(review): this hook runs before the subclass constructor body, so implementations must not rely on subclass fields.
        // The common prefixes below are put afterwards and therefore win over any same-named prefix registered by the subclass.
        registerNamespaces(nsContext);

        nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
        nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
        nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
        nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
        nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
        nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");

        DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
    }

    /**
     * Replaces the content of {@link #code2name} with the rows of the {@code class} table.
     *
     * @throws IOException           declared for the {@code DbClient} resource handling
     * @throws IllegalStateException if a row cannot be read; a partially loaded vocabulary would silently produce qualifiers without a
     *                               class name, so the failure is propagated instead of being printed and ignored
     */
    private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
        try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
            code2name.clear();
            dbClient.processResults("select code, name from class", rs -> {
                try {
                    code2name.put(rs.getString("code"), rs.getString("name"));
                } catch (final SQLException e) {
                    throw new IllegalStateException("Error reading a row of the class table", e);
                }
            });
        }
    }

    /**
     * Converts every record of every valid collection into OAF objects and hands each one to {@code emitOaf}.
     *
     * @param mdFormat         format used to select the collections
     * @param mdLayout         layout used to select the collections
     * @param mdInterpretation interpretation used to select the collections
     * @throws DocumentException if a record is not parseable XML
     */
    public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {

        for (final Entry<String, String> entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) {
            // only the value of each entry (the collection name) is needed to list the records
            final String currentColl = entry.getValue();

            for (final String xml : mdstoreClient.listRecords(currentColl)) {
                final Document doc = DocumentHelper.parseText(xml);

                final String type = doc.valueOf("//dr:CobjCategory/@type");
                final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
                final DataInfo info = prepareDataInfo(doc);
                final long lastUpdateTimestamp = new Date().getTime();

                for (final Oaf oaf : createOafs(doc, type, collectedFrom, info, lastUpdateTimestamp)) {
                    emitOaf(oaf);
                }
            }
        }
    }

    /**
     * Lets the subclass add the XPath namespace prefixes specific to its metadata format.
     *
     * @param nsContext mutable prefix -> namespace URI map, completed by the constructor with the common prefixes
     */
    protected abstract void registerNamespaces(Map<String, String> nsContext);

    /**
     * Builds the result entity matching the record type, then lets the subclass append the relations.
     *
     * @param doc                 parsed record
     * @param type                value of {@code //dr:CobjCategory/@type}; compared case-insensitively, empty means publication and any
     *                            unknown value means other research product
     * @param collectedFrom       datasource the record was collected from
     * @param info                data info attached to the generated objects
     * @param lastUpdateTimestamp timestamp stored in the generated objects
     * @return the generated objects (the result first, then whatever {@link #addRelations} appended)
     */
    protected List<Oaf> createOafs(final Document doc, final String type, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {

        final List<Oaf> oafs = new ArrayList<>();

        // Locale.ROOT: with the default locale the lower-casing of e.g. "PUBLICATION" is wrong under a Turkish locale
        switch (type.toLowerCase(java.util.Locale.ROOT)) {
        case "":
        case "publication":
            final Publication p = new Publication();
            populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp);
            p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
            p.setJournal(prepareJournal(doc, info));
            oafs.add(p);
            break;
        case "dataset":
            final Dataset d = new Dataset();
            populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp);
            d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
            d.setStoragedate(prepareDatasetStorageDate(doc, info));
            d.setDevice(prepareDatasetDevice(doc, info));
            d.setSize(prepareDatasetSize(doc, info));
            d.setVersion(prepareDatasetVersion(doc, info));
            d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
            d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
            d.setGeolocation(prepareDatasetGeoLocations(doc, info));
            oafs.add(d);
            break;
        case "software":
            final Software s = new Software();
            populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp);
            s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
            s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
            s.setLicense(prepareSoftwareLicenses(doc, info));
            s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
            s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
            oafs.add(s);
            break;
        case "otherresearchproducts":
        default:
            final OtherResearchProduct o = new OtherResearchProduct();
            populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp);
            o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
            o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
            o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
            o.setTool(prepareOtherResearchProductTools(doc, info));
            oafs.add(o);
            break;
        }

        if (!oafs.isEmpty()) {
            // NOTE(review): placeholder kept from the original - three identical calls with the dummy type "TYPE";
            // presumably one call per relation type is intended once the types are defined.
            addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
            addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
            addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
        }

        return oafs;
    }

    /**
     * Fills the properties shared by all the result subtypes.
     *
     * @param r                   result to populate
     * @param doc                 parsed record
     * @param collectedFrom       datasource the record was collected from
     * @param info                data info attached to the result and to its fields
     * @param lastUpdateTimestamp timestamp stored in the result
     */
    private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
        // evaluated once: the same identifier feeds both the generated id and the original id
        final String objIdentifier = doc.valueOf("//dri:objIdentifier");

        r.setDataInfo(info);
        r.setLastupdatetimestamp(lastUpdateTimestamp);
        r.setId(createOpenaireId(50, objIdentifier)); // NOTE(review): 50 is presumably the prefix of result entities - confirm
        r.setOriginalId(Arrays.asList(objIdentifier));
        r.setCollectedfrom(Arrays.asList(collectedFrom));
        r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
        r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
        r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
        r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
        r.setOaiprovenance(prepareOAIprovenance(doc));
        r.setAuthor(prepareAuthors(doc, info));
        r.setLanguage(prepareLanguages(doc));
        r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
        r.setSubject(prepareSubjects(doc, info));
        r.setTitle(prepareTitles(doc, info));
        r.setRelevantdate(prepareRelevantDates(doc, info));
        r.setDescription(prepareDescriptions(doc, info));
        r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
        r.setPublisher(preparePublisher(doc, info));
        r.setEmbargoenddate(prepareEmbargoEndDate(doc, info));
        r.setSource(prepareSources(doc, info));
        r.setFulltext(null); // NOT PRESENT IN MDSTORES
        r.setFormat(prepareFormats(doc, info));
        r.setContributor(prepareContributors(doc, info));
        r.setResourcetype(null); // TODO
        r.setCoverage(prepareCoverages(doc, info));
        r.setRefereed(null); // TODO
        r.setContext(null); // TODO
        r.setExternalReference(null); // TODO
        r.setInstance(prepareInstances(doc, info));
        r.setProcessingchargeamount(null); // TODO
        r.setProcessingchargecurrency(null); // TODO
    }

    // Format-specific hooks used by populateResultFields: each returned value is stored in the like-named property of the result.

    protected abstract List<Instance> prepareInstances(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);

    protected abstract Field<String> prepareEmbargoEndDate(Document doc, DataInfo info);

    protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);

    protected abstract Field<String> preparePublisher(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);

    protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);

    protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);

    protected abstract Qualifier prepareLanguages(Document doc);

    protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);

    // Format-specific hooks used by createOafs for the OtherResearchProduct branch.

    protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);

    // Format-specific hooks used by createOafs for the Software branch.

    protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);

    protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);

    protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);

    protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);

    // Format-specific hooks used by createOafs for the Dataset branch.

    protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);

    protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);

    protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);

    protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);

    protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);

    protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);

    protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);

    /**
     * Appends to {@code oafs} the relations found in the record.
     *
     * @param oafs                list to append to; when called by {@link #createOafs} it already contains the result
     * @param doc                 parsed record
     * @param type                relation type requested by the caller
     * @param collectedFrom       datasource the record was collected from
     * @param info                data info of the record
     * @param lastUpdateTimestamp timestamp of the current run
     */
    protected abstract void addRelations(final List<Oaf> oafs,
            final Document doc,
            final String type,
            final KeyValue collectedFrom,
            final DataInfo info,
            final long lastUpdateTimestamp);

    /**
     * Builds the journal from the first {@code oaf:journal} element.
     *
     * @return the journal, or {@code null} when the element is missing or its text is blank
     */
    private Journal prepareJournal(final Document doc, final DataInfo info) {
        final Node n = doc.selectSingleNode("//oaf:journal");
        if (n == null) { return null; }

        final String name = n.getText();
        if (StringUtils.isBlank(name)) { return null; }

        final String issnPrinted = n.valueOf("@issn");
        final String issnOnline = n.valueOf("@eissn");
        final String issnLinking = n.valueOf("@lissn");
        return journal(name, issnPrinted, issnOnline, issnLinking, null, null, null, null, null, null, null, info);
    }

    /**
     * Builds a qualifier whose class id is the value of {@code xpath} and whose class name comes from {@link #code2name}
     * ({@code null} when the code is unknown).
     */
    protected Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) {
        final String classId = doc.valueOf(xpath);
        final String className = code2name.get(classId);
        return qualifier(classId, className, schemeId, schemeName);
    }

    /**
     * Builds one structured property per node selected by {@code xpath}; the class id is read from each node with the relative
     * {@code xpathClassId} and the class name is resolved through {@link #code2name}.
     */
    protected List<StructuredProperty> prepareListStructProps(final Document doc,
            final String xpath,
            final String xpathClassId,
            final String schemeId,
            final String schemeName,
            final DataInfo info) {
        final List<StructuredProperty> res = new ArrayList<>();
        for (final Object o : doc.selectNodes(xpath)) {
            final Node n = (Node) o;
            final String classId = n.valueOf(xpathClassId);
            final String className = code2name.get(classId);
            res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
        }
        return res;
    }

    /** Builds one structured property per node selected by {@code xpath}, all sharing the given qualifier. */
    protected List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) {
        final List<StructuredProperty> res = new ArrayList<>();
        for (final Object o : doc.selectNodes(xpath)) {
            final Node n = (Node) o;
            res.add(structuredProperty(n.getText(), qualifier, info));
        }
        return res;
    }

    /**
     * Builds one structured property per node selected by {@code xpath}, reading the qualifier from the node attributes
     * {@code classid}, {@code classname}, {@code schemeid} and {@code schemename}.
     */
    protected List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final DataInfo info) {
        final List<StructuredProperty> res = new ArrayList<>();
        for (final Object o : doc.selectNodes(xpath)) {
            final Node n = (Node) o;
            res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
                    .valueOf("@schemename"), info));
        }
        return res;
    }

    /**
     * Builds the OAI provenance from the {@code provenance/originDescription} element (matched by local name, whatever the namespace).
     *
     * @return the provenance, or {@code null} when the record has no origin description
     */
    protected OAIProvenance prepareOAIprovenance(final Document doc) {
        final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");

        // records without provenance must not break the whole migration with a NullPointerException
        if (n == null) { return null; }

        final String identifier = n.valueOf("./*[local-name()='identifier']");
        final String baseURL = n.valueOf("./*[local-name()='baseURL']");
        final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
        final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
        final String datestamp = n.valueOf("./*[local-name()='datestamp']");
        final String harvestDate = n.valueOf("@harvestDate");

        return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
    }

    /**
     * Builds the data info from the first {@code oaf:datainfo} element.
     *
     * <p>
     * NOTE(review): assumes every record contains an {@code oaf:datainfo} element; a record without it fails with a
     * {@code NullPointerException} - confirm whether a default should be used instead.
     */
    protected DataInfo prepareDataInfo(final Document doc) {
        final Node n = doc.selectSingleNode("//oaf:datainfo");

        final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
        final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
        final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
        final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");

        final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
        final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
        final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
        final String trust = n.valueOf("./oaf:trust");

        return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
    }

    /** Wraps the string value of {@code xpath} into a field carrying the given data info. */
    protected Field<String> prepareField(final Document doc, final String xpath, final DataInfo info) {
        return field(doc.valueOf(xpath), info);
    }

    /** Builds one field per non-blank text selected by {@code xpath}. */
    protected List<Field<String>> prepareListFields(final Document doc, final String xpath, final DataInfo info) {
        // toArray() without argument returns an Object[]: casting it to String[] fails with a ClassCastException
        return listFields(info, prepareListString(doc, xpath).toArray(new String[0]));
    }

    /** Returns the trimmed, non-blank texts of the nodes selected by {@code xpath}, in document order. */
    protected List<String> prepareListString(final Document doc, final String xpath) {
        final List<String> res = new ArrayList<>();
        for (final Object o : doc.selectNodes(xpath)) {
            final String s = ((Node) o).getText().trim();
            if (StringUtils.isNotBlank(s)) {
                res.add(s);
            }
        }
        return res;
    }

    /**
     * Closes the parent resources and the mdstore client; the client is closed even when the parent {@code close()} fails.
     */
    @Override
    public void close() throws IOException {
        try {
            super.close();
        } finally {
            mdstoreClient.close();
        }
    }

}

View File

@ -28,7 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable { public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");

View File

@ -1,56 +1,10 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication implements Closeable { public class MigrateMongoMdstoresApplication {
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
private final Map<String, String> code2name = new HashMap<>();
private final MdstoreClient mdstoreClient;
private static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
private static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
private static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
private static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
private static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -72,294 +26,17 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication
final String dbUser = parser.get("postgresUser"); final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword"); final String dbPassword = parser.get("postgresPassword");
try (final MigrateMongoMdstoresApplication mig = if (mdFormat.equalsIgnoreCase("oaf")) {
new MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) { try (final OafMigrationExecutor mig =
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
} }
} else if (mdFormat.equalsIgnoreCase("oaf")) {
} } else {
throw new RuntimeException("Format not supported: " + mdFormat);
/**
 * Opens the mdstore client, loads the class names and installs the XPath namespace prefixes on the shared
 * {@code DocumentFactory} instance.
 *
 * @throws Exception if the parent constructor, the mdstore client or the class-name loading fails
 */
public MigrateMongoMdstoresApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
        final String mongoDb, final String dbUrl, final String dbUser,
        final String dbPassword) throws Exception {
    super(hdfsPath, hdfsNameNode, hdfsUser);

    this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
    loadClassNames(dbUrl, dbUser, dbPassword);

    // prefix / namespace URI pairs usable in the XPath expressions of this class
    final String[][] prefixes = {
            { "dc", "http://purl.org/dc/elements/1.1/" },
            { "dr", "http://www.driver-repository.eu/namespace/dr" },
            { "dri", "http://www.driver-repository.eu/namespace/dri" },
            { "oaf", "http://namespace.openaire.eu/oaf" },
            { "oai", "http://www.openarchives.org/OAI/2.0/" },
            { "prov", "http://www.openarchives.org/OAI/2.0/provenance" }
    };

    final Map<String, String> namespaces = new HashMap<>();
    for (final String[] pair : prefixes) {
        namespaces.put(pair[0], pair[1]);
    }
    DocumentFactory.getInstance().setXPathNamespaceURIs(namespaces);
}
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
code2name.clear();
dbClient.processResults("select code, name from class", rs -> {
try {
code2name.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) {
e.printStackTrace();
}
});
} }
} }
/**
 * Converts every record of every valid collection into OAF objects and passes each one to {@code emitOaf}.
 *
 * @throws DocumentException if a record is not parseable XML
 */
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
    for (final Entry<String, String> collection : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) {
        // the entry key is not used: only the collection name (the value) is needed
        final String collectionName = collection.getValue();
        for (final String record : mdstoreClient.listRecords(collectionName)) {
            final List<Oaf> generated = createOafs(record);
            for (final Oaf item : generated) {
                emitOaf(item);
            }
        }
    }
}
/**
 * Parses a record and builds the result entity matching its type, followed by the (still placeholder) relations.
 *
 * <p>
 * The type is read from {@code //dr:CobjCategory/@type} and compared case-insensitively: empty means publication, any unknown value
 * means other research product.
 *
 * @param xml the record
 * @return the generated objects, the result first
 * @throws DocumentException if the record is not parseable XML
 */
private List<Oaf> createOafs(final String xml) throws DocumentException {
    final Document doc = DocumentHelper.parseText(xml);

    final String type = doc.valueOf("//dr:CobjCategory/@type");
    final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
    final DataInfo info = prepareDataInfo(doc);
    final long lastUpdateTimestamp = new Date().getTime();

    final List<Oaf> oafs = new ArrayList<>();

    // Locale.ROOT: with the default locale the lower-casing of e.g. "PUBLICATION" is wrong under a Turkish locale
    switch (type.toLowerCase(java.util.Locale.ROOT)) {
    case "":
    case "publication":
        final Publication p = new Publication();
        populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp);
        p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
        p.setJournal(null); // TODO
        oafs.add(p);
        break;
    case "dataset":
        final Dataset d = new Dataset();
        populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp);
        d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
        d.setStoragedate(null); // TODO
        d.setDevice(null); // TODO
        d.setSize(null); // TODO
        d.setVersion(null); // TODO
        d.setLastmetadataupdate(null); // TODO
        d.setMetadataversionnumber(null); // TODO
        d.setGeolocation(null); // TODO
        oafs.add(d);
        break;
    case "software":
        final Software s = new Software();
        populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp);
        s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
        s.setDocumentationUrl(null); // TODO
        s.setLicense(null); // TODO
        s.setCodeRepositoryUrl(null); // TODO
        s.setProgrammingLanguage(null); // TODO
        oafs.add(s);
        break;
    // this label used to sit above "software", so other research products were emitted as Software
    case "otherresearchproducts":
    default:
        final OtherResearchProduct o = new OtherResearchProduct();
        populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp);
        o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
        o.setContactperson(null); // TODO
        o.setContactgroup(null); // TODO
        o.setTool(null); // TODO
        oafs.add(o);
        break;
    }

    if (!oafs.isEmpty()) {
        // NOTE(review): placeholder kept as is - three identical calls over every element ("//*") with the dummy type "TYPE"
        addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
        addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
        addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
    }

    return oafs;
}
/**
 * Appends to {@code oafs} one relation per node selected by {@code xpath}.
 *
 * <p>
 * Only the provenance properties (collected from, data info, timestamp) are filled; the relation semantics are still to be defined
 * and the {@code type} parameter is not used yet.
 */
private void addRelations(final List<Oaf> oafs,
        final Document doc,
        final String xpath,
        final String type,
        final KeyValue collectedFrom,
        final DataInfo info,
        final long lastUpdateTimestamp) {

    for (final Object selected : doc.selectNodes(xpath)) {
        final Node node = (Node) selected; // not read yet: the relation fields below are placeholders

        final Relation rel = new Relation();

        // provenance of the relation
        rel.setCollectedFrom(Arrays.asList(collectedFrom));
        rel.setDataInfo(info);
        rel.setLastupdatetimestamp(lastUpdateTimestamp);

        // semantics of the relation
        rel.setRelType(null); // TODO
        rel.setSubRelType(null); // TODO
        rel.setRelClass(null); // TODO
        rel.setSource(null); // TODO
        rel.setTarget(null); // TODO

        oafs.add(rel);
    }
}
/**
 * Fills the properties shared by all the result subtypes, reading them from the record with XPath expressions.
 * The properties marked TODO are explicitly set to {@code null}.
 */
private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
    // the same identifier feeds both the generated id and the original id
    final String objIdentifier = doc.valueOf("//dri:objIdentifier");
    final String pidScheme = "dnet:pid_types";
    final String languageScheme = "dnet:languages";

    // identity and provenance
    r.setDataInfo(info);
    r.setLastupdatetimestamp(lastUpdateTimestamp);
    r.setId(createOpenaireId(50, objIdentifier));
    r.setOriginalId(Arrays.asList(objIdentifier));
    r.setCollectedfrom(Arrays.asList(collectedFrom));
    r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", pidScheme, pidScheme, info));
    r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
    r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
    r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
    r.setOaiprovenance(prepareOAIprovenance(doc));

    // descriptive metadata
    r.setAuthor(null); // TODO
    r.setLanguage(prepareQualifier(doc, "//dc:language", languageScheme, languageScheme));
    r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
    r.setSubject(prepareListStructProps(doc, "//dc:subject", info));
    r.setTitle(prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info));
    r.setRelevantdate(null); // TODO
    r.setDescription(prepareListFields(doc, "//dc:description", info));
    r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
    r.setPublisher(prepareField(doc, "//dc:publisher", info));
    r.setEmbargoenddate(null); // TODO
    r.setSource(null); // TODO
    r.setFulltext(null); // TODO
    r.setFormat(prepareListFields(doc, "//dc:format", info));
    r.setContributor(prepareListFields(doc, "//dc:contributor", info));
    r.setResourcetype(null); // TODO
    r.setCoverage(prepareListFields(doc, "//dc:coverage", info));

    // not extracted yet
    r.setRefereed(null); // TODO
    r.setContext(null); // TODO
    r.setExternalReference(null); // TODO
    r.setInstance(null); // TODO
    r.setProcessingchargeamount(null); // TODO
    r.setProcessingchargecurrency(null); // TODO
}
/**
 * Builds a qualifier whose class id is the value of {@code xpath}; the class name is looked up in {@code code2name}
 * and is {@code null} when the code is unknown.
 */
private Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) {
    final String code = doc.valueOf(xpath);
    return qualifier(code, code2name.get(code), schemeId, schemeName);
}
/**
 * Builds one structured property per node selected by {@code xpath}: the class id is read from the node with the relative
 * {@code xpathClassId}, the class name is looked up in {@code code2name}.
 */
private List<StructuredProperty> prepareListStructProps(final Document doc,
        final String xpath,
        final String xpathClassId,
        final String schemeId,
        final String schemeName,
        final DataInfo info) {
    final List<StructuredProperty> props = new ArrayList<>();
    for (final Object selected : doc.selectNodes(xpath)) {
        final Node node = (Node) selected;
        final String code = node.valueOf(xpathClassId);
        props.add(structuredProperty(node.getText(), code, code2name.get(code), schemeId, schemeName, info));
    }
    return props;
}
/** Builds one structured property per node selected by {@code xpath}, all sharing the given qualifier. */
private List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) {
    final List<StructuredProperty> props = new ArrayList<>();
    for (final Object selected : doc.selectNodes(xpath)) {
        final String text = ((Node) selected).getText();
        props.add(structuredProperty(text, qualifier, info));
    }
    return props;
}
/**
 * Builds one structured property per node selected by {@code xpath}, taking the qualifier from the node attributes
 * {@code classid}, {@code classname}, {@code schemeid} and {@code schemename}.
 */
private List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final DataInfo info) {
    final List<StructuredProperty> props = new ArrayList<>();
    for (final Object selected : doc.selectNodes(xpath)) {
        final Node node = (Node) selected;
        final String text = node.getText();
        final String classId = node.valueOf("@classid");
        final String className = node.valueOf("@classname");
        final String schemeId = node.valueOf("@schemeid");
        final String schemeName = node.valueOf("@schemename");
        props.add(structuredProperty(text, classId, className, schemeId, schemeName, info));
    }
    return props;
}
/**
 * Builds the OAI provenance from the {@code provenance/originDescription} element (matched by local name, whatever the namespace).
 *
 * @return the provenance, or {@code null} when the record has no origin description
 */
private OAIProvenance prepareOAIprovenance(final Document doc) {
    final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");

    // selectSingleNode returns null when nothing matches: without this guard such records fail with a NullPointerException
    if (n == null) { return null; }

    final String identifier = n.valueOf("./*[local-name()='identifier']");
    final String baseURL = n.valueOf("./*[local-name()='baseURL']");
    final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
    final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
    final String datestamp = n.valueOf("./*[local-name()='datestamp']");
    final String harvestDate = n.valueOf("@harvestDate");

    return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
}
/**
 * Builds the data info from the first {@code oaf:datainfo} element of the record.
 * The provenance action qualifier is read from the attributes of its {@code oaf:provenanceaction} child.
 */
private DataInfo prepareDataInfo(final Document doc) {
    final Node datainfo = doc.selectSingleNode("//oaf:datainfo");

    // provenance action
    final String actionClassId = datainfo.valueOf("./oaf:provenanceaction/@classid");
    final String actionClassName = datainfo.valueOf("./oaf:provenanceaction/@classname");
    final String actionSchemeId = datainfo.valueOf("./oaf:provenanceaction/@schemeid");
    final String actionSchemeName = datainfo.valueOf("./oaf:provenanceaction/@schemename");
    final Qualifier provenanceAction = qualifier(actionClassId, actionClassName, actionSchemeId, actionSchemeName);

    // inference flags and trust
    final boolean deletedbyinference = Boolean.parseBoolean(datainfo.valueOf("./oaf:deletedbyinference"));
    final String inferenceprovenance = datainfo.valueOf("./oaf:inferenceprovenance");
    final Boolean inferred = Boolean.parseBoolean(datainfo.valueOf("./oaf:inferred"));
    final String trust = datainfo.valueOf("./oaf:trust");

    return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, provenanceAction, trust);
}
/** Wraps the string value of {@code xpath} into a field carrying the given data info. */
private Field<String> prepareField(final Document doc, final String xpath, final DataInfo info) {
    final String value = doc.valueOf(xpath);
    return field(value, info);
}
/** Builds one field per non-blank text selected by {@code xpath}, each carrying the given data info. */
private List<Field<String>> prepareListFields(final Document doc, final String xpath, final DataInfo info) {
    // toArray() without argument returns an Object[]: casting it to String[] fails with a ClassCastException
    final String[] values = prepareListString(doc, xpath).toArray(new String[0]);
    return listFields(info, values);
}
/** Returns the trimmed, non-blank texts of the nodes selected by {@code xpath}, in the order they are selected. */
private List<String> prepareListString(final Document doc, final String xpath) {
    final List<String> texts = new ArrayList<>();
    for (final Object selected : doc.selectNodes(xpath)) {
        final Node node = (Node) selected;
        final String text = node.getText().trim();
        if (StringUtils.isBlank(text)) {
            continue;
        }
        texts.add(text);
    }
    return texts;
}
/*
* private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if
* (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value =
* parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2],
* arr[3], dataInfo); } } return null; }
*
* private List<StructuredProperty> prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final
* List<StructuredProperty> res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final
* StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } }
*
* return res; }
*
* private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final
* String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final
* String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
* if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null,
* null, info); } } } return null; }
*/
/**
 * Closes the resources held by the superclass and then the mdstore client.
 *
 * @throws IOException if closing either resource fails
 */
@Override
public void close() throws IOException {
    try {
        super.close();
    } finally {
        // Always release the mdstore client, even when the superclass close fails,
        // so a failure there does not leak the client.
        mdstoreClient.close();
    }
}
} }

View File

@ -1,154 +0,0 @@
package eu.dnetlib.dhp.migration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
 * Static helper methods that create and populate the OAF model objects
 * ({@link KeyValue}, {@link Field}, {@link Qualifier}, {@link StructuredProperty},
 * {@link ExtraInfo}, {@link OAIProvenance}, {@link Journal}, {@link DataInfo})
 * and build OpenAIRE identifiers.
 */
public class MigrationUtils {

    /**
     * Creates a {@link KeyValue} with the given key and value.
     *
     * @param k the key
     * @param v the value
     * @return a new {@link KeyValue}
     */
    public static KeyValue keyValue(final String k, final String v) {
        final KeyValue kv = new KeyValue();
        kv.setKey(k);
        kv.setValue(v);
        return kv;
    }

    /**
     * Creates a list of {@link KeyValue}s from a flat sequence of alternating keys and values.
     *
     * @param s the keys and values, in the order {@code k, v, k, v, ...}
     * @return a new list with one {@link KeyValue} per pair
     * @throws IllegalArgumentException if an odd number of strings is supplied
     */
    public static List<KeyValue> listKeyValues(final String... s) {
        if (s.length % 2 > 0) {
            // IllegalArgumentException is a RuntimeException, so existing handlers keep working
            throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
        }

        final List<KeyValue> list = new ArrayList<>();
        for (int i = 0; i < s.length; i += 2) {
            list.add(keyValue(s[i], s[i + 1]));
        }
        return list;
    }

    /**
     * Creates a {@link Field} holding the given value and data info.
     *
     * @param value the field value
     * @param info  the data info attached to the field
     * @param <T>   the type of the value
     * @return a new {@link Field}
     */
    public static <T> Field<T> field(final T value, final DataInfo info) {
        final Field<T> field = new Field<>();
        field.setValue(value);
        field.setDataInfo(info);
        return field;
    }

    /**
     * Creates one {@link Field} per value, all sharing the same data info.
     *
     * @param info   the data info attached to every field
     * @param values the field values
     * @return a list with one {@link Field} per value, in the given order
     */
    public static List<Field<String>> listFields(final DataInfo info, final String... values) {
        return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList());
    }

    /**
     * Creates a {@link Qualifier} from its four components.
     *
     * @param classid    the class id
     * @param classname  the class name
     * @param schemeid   the scheme id
     * @param schemename the scheme name
     * @return a new {@link Qualifier}
     */
    public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
        final Qualifier q = new Qualifier();
        q.setClassid(classid);
        q.setClassname(classname);
        q.setSchemeid(schemeid);
        q.setSchemename(schemename);
        return q;
    }

    /**
     * Creates a {@link StructuredProperty} whose qualifier is built with
     * {@link #qualifier(String, String, String, String)}.
     *
     * @param value      the property value
     * @param classid    the qualifier class id
     * @param classname  the qualifier class name
     * @param schemeid   the qualifier scheme id
     * @param schemename the qualifier scheme name
     * @param dataInfo   the data info attached to the property
     * @return a new {@link StructuredProperty}
     */
    public static StructuredProperty structuredProperty(final String value,
            final String classid,
            final String classname,
            final String schemeid,
            final String schemename,
            final DataInfo dataInfo) {
        final StructuredProperty sp = new StructuredProperty();
        sp.setValue(value);
        sp.setQualifier(qualifier(classid, classname, schemeid, schemename));
        sp.setDataInfo(dataInfo);
        return sp;
    }

    /**
     * Creates an {@link ExtraInfo} with the given properties.
     *
     * @param name       the name
     * @param value      the value
     * @param typology   the typology
     * @param provenance the provenance
     * @param trust      the trust
     * @return a new {@link ExtraInfo}
     */
    public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
        final ExtraInfo info = new ExtraInfo();
        info.setName(name);
        info.setValue(value);
        info.setTypology(typology);
        info.setProvenance(provenance);
        info.setTrust(trust);
        return info;
    }

    /**
     * Creates an {@link OAIProvenance} wrapping a new {@link OriginDescription}
     * populated with the given properties.
     *
     * @param identifier        the identifier
     * @param baseURL           the base URL
     * @param metadataNamespace the metadata namespace
     * @param altered           the altered flag
     * @param datestamp         the datestamp
     * @param harvestDate       the harvest date
     * @return a new {@link OAIProvenance}
     */
    public static OAIProvenance oaiIProvenance(final String identifier,
            final String baseURL,
            final String metadataNamespace,
            final Boolean altered,
            final String datestamp,
            final String harvestDate) {
        final OriginDescription desc = new OriginDescription();
        desc.setIdentifier(identifier);
        desc.setBaseURL(baseURL);
        desc.setMetadataNamespace(metadataNamespace);
        desc.setAltered(altered);
        desc.setDatestamp(datestamp);
        desc.setHarvestDate(harvestDate);

        final OAIProvenance p = new OAIProvenance();
        p.setOriginDescription(desc);
        return p;
    }

    /**
     * Creates a {@link Journal}; each argument is passed to the setter of the same name.
     *
     * @param name            value for {@code setName}
     * @param issnPrinted     value for {@code setIssnPrinted}
     * @param issnOnline      value for {@code setIssnOnline}
     * @param issnLinking     value for {@code setIssnLinking}
     * @param ep              value for {@code setEp}
     * @param iss             value for {@code setIss}
     * @param sp              value for {@code setSp}
     * @param vol             value for {@code setVol}
     * @param edition         value for {@code setEdition}
     * @param conferenceplace value for {@code setConferenceplace}
     * @param conferencedate  value for {@code setConferencedate}
     * @param dataInfo        value for {@code setDataInfo}
     * @return a new {@link Journal}
     */
    public static Journal journal(final String name,
            final String issnPrinted,
            final String issnOnline,
            final String issnLinking,
            final String ep,
            final String iss,
            final String sp,
            final String vol,
            final String edition,
            final String conferenceplace,
            final String conferencedate,
            final DataInfo dataInfo) {
        final Journal j = new Journal();
        j.setName(name);
        j.setIssnPrinted(issnPrinted);
        j.setIssnOnline(issnOnline);
        j.setIssnLinking(issnLinking);
        j.setEp(ep);
        j.setIss(iss);
        j.setSp(sp);
        j.setVol(vol);
        j.setEdition(edition);
        j.setConferenceplace(conferenceplace);
        j.setConferencedate(conferencedate);
        j.setDataInfo(dataInfo);
        return j;
    }

    /**
     * Creates a {@link DataInfo} with the given properties.
     *
     * @param deletedbyinference  the deleted-by-inference flag
     * @param inferenceprovenance the inference provenance
     * @param inferred            the inferred flag
     * @param invisible           the invisible flag
     * @param provenanceaction    the provenance action qualifier
     * @param trust               the trust
     * @return a new {@link DataInfo}
     */
    public static DataInfo dataInfo(final Boolean deletedbyinference,
            final String inferenceprovenance,
            final Boolean inferred,
            final Boolean invisible,
            final Qualifier provenanceaction,
            final String trust) {
        final DataInfo d = new DataInfo();
        d.setDeletedbyinference(deletedbyinference);
        d.setInferenceprovenance(inferenceprovenance);
        d.setInferred(inferred);
        d.setInvisible(invisible);
        d.setProvenanceaction(provenanceaction);
        d.setTrust(trust);
        return d;
    }

    /**
     * Builds an identifier of the form {@code <prefix>|<nsPrefix>::<hash>}, where
     * {@code nsPrefix} is the part of {@code originalId} before {@code "::"} and
     * {@code hash} is the result of {@code DHPUtils.md5} applied to the part after it.
     *
     * @param prefix     the leading part of the generated identifier
     * @param originalId the original identifier, split around {@code "::"}
     * @return the generated identifier
     */
    public static String createOpenaireId(final String prefix, final String originalId) {
        final String nsPrefix = StringUtils.substringBefore(originalId, "::");
        final String rest = StringUtils.substringAfter(originalId, "::");
        return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
    }

}

View File

@ -0,0 +1,246 @@
package eu.dnetlib.dhp.migration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
/**
 * {@link AbstractMongoExecutor} specialization that reads record properties from
 * Dublin Core ({@code dc:*}) elements of the input documents.
 *
 * Partial implementation: several {@code prepare*} methods are still stubs that return
 * {@code null}, and {@link #addRelations} contains placeholders marked with TODO.
 */
public class OafMigrationExecutor extends AbstractMongoExecutor {

    // Logger registered under this class, so log output is attributed to the right component.
    private static final Log log = LogFactory.getLog(OafMigrationExecutor.class);

    /**
     * Creates the executor; all arguments are forwarded unchanged to the superclass constructor.
     *
     * @param hdfsPath     forwarded to the superclass
     * @param hdfsNameNode forwarded to the superclass
     * @param hdfsUser     forwarded to the superclass
     * @param mongoBaseUrl forwarded to the superclass
     * @param mongoDb      forwarded to the superclass
     * @param dbUrl        forwarded to the superclass
     * @param dbUser       forwarded to the superclass
     * @param dbPassword   forwarded to the superclass
     * @throws Exception if the superclass constructor fails
     */
    public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
            final String dbUrl, final String dbUser,
            final String dbPassword) throws Exception {
        super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
    }

    /**
     * Registers the namespace prefixes (dc, dr, dri, oaf, oai, prov) used by the XPath
     * expressions of this class.
     *
     * @param nsContext the prefix-to-URI map to fill
     */
    @Override
    protected void registerNamespaces(final Map<String, String> nsContext) {
        nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
        nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
        nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
        nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
        nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
        nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
    }

    /**
     * Appends one {@link Relation} to {@code oafs} for every node selected in {@code doc}.
     * Still a placeholder: the XPath and the relation type, class, source and target are TODO.
     *
     * @param oafs                the list the relations are appended to
     * @param doc                 the document to read from
     * @param type                not used by the current implementation
     * @param collectedFrom       set as the only collectedFrom entry of each relation
     * @param info                the data info set on each relation
     * @param lastUpdateTimestamp the timestamp set on each relation
     */
    @Override
    protected void addRelations(final List<Oaf> oafs,
            final Document doc,
            final String type,
            final KeyValue collectedFrom,
            final DataInfo info,
            final long lastUpdateTimestamp) {
        // NOTE(review): "//" is a placeholder and does not look like a complete XPath expression;
        // it must be replaced before this method can be relied upon.
        for (final Object o : doc.selectNodes("//")) { // TODO
            final Node n = (Node) o;
            final Relation r = new Relation();
            r.setRelType(null); // TODO
            r.setSubRelType(null); // TODO
            r.setRelClass(null); // TODO
            r.setSource(null); // TODO
            r.setTarget(null); // TODO
            r.setCollectedFrom(Arrays.asList(collectedFrom));
            r.setDataInfo(info);
            r.setLastupdatetimestamp(lastUpdateTimestamp);
            oafs.add(r);
        }
    }

    /**
     * Creates one {@link Author} per {@code dc:creator} node, using the node text as full name
     * and a 1-based counter, following the order of the selected nodes, as rank.
     *
     * @param doc  the document to read from
     * @param info not used by the current implementation
     * @return the list of authors, empty when no {@code dc:creator} node is selected
     */
    @Override
    protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
        final List<Author> res = new ArrayList<>();
        int pos = 1;
        for (final Object o : doc.selectNodes("//dc:creator")) {
            final Node n = (Node) o;
            final Author author = new Author();
            author.setFullname(n.getText());
            author.setRank(pos++);
            // the author must be collected, otherwise the method always returns an empty list
            res.add(author);
        }
        return res;
    }

    /** Reads the language qualifier from {@code //dc:language}. */
    @Override
    protected Qualifier prepareLanguages(final Document doc) {
        return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
    }

    /** Reads the subjects from {@code //dc:subject}. */
    @Override
    protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
        return prepareListStructProps(doc, "//dc:subject", info);
    }

    /** Reads the titles from {@code //dc:title}, passing {@code MAIN_TITLE_QUALIFIER} as qualifier. */
    @Override
    protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
        return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
    }

    /** Reads the descriptions from {@code //dc:description}. */
    @Override
    protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//dc:description", info);
    }

    /** Reads the publisher from {@code //dc:publisher}. */
    @Override
    protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
        return prepareField(doc, "//dc:publisher", info);
    }

    /** Reads the formats from {@code //dc:format}. */
    @Override
    protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//dc:format", info);
    }

    /** Reads the contributors from {@code //dc:contributor}. */
    @Override
    protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//dc:contributor", info);
    }

    /** Reads the coverages from {@code //dc:coverage}. */
    @Override
    protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//dc:coverage", info);
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<Instance> prepareInstances(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Reads the sources from {@code //dc:source}. */
    @Override
    protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//dc:source", info);
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareEmbargoEndDate(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /** Not implemented yet: always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
        // TODO Auto-generated method stub
        return null;
    }

    /*
     * private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if
     * (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value =
     * parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2],
     * arr[3], dataInfo); } } return null; }
     *
     * private List<StructuredProperty> prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final
     * List<StructuredProperty> res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final
     * StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } }
     *
     * return res; }
     *
     * private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final
     * String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final
     * String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
     * if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null,
     * null, info); } } } return null; }
     */

}