forked from D-Net/dnet-hadoop
partial implementation of migration
This commit is contained in:
parent
6bfe2dc96e
commit
fbb0fc140b
|
@ -30,7 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class AbstractMigrateApplication implements Closeable {
|
public class AbstractMigrationExecutor implements Closeable {
|
||||||
|
|
||||||
private final AtomicInteger counter = new AtomicInteger(0);
|
private final AtomicInteger counter = new AtomicInteger(0);
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ public class AbstractMigrateApplication implements Closeable {
|
||||||
|
|
||||||
private final SequenceFile.Writer writer;
|
private final SequenceFile.Writer writer;
|
||||||
|
|
||||||
public AbstractMigrateApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
|
public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
|
||||||
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
|
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
|
||||||
.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class));
|
.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class));
|
||||||
}
|
}
|
|
@ -0,0 +1,369 @@
|
||||||
|
package eu.dnetlib.dhp.migration;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.dom4j.Document;
|
||||||
|
import org.dom4j.DocumentException;
|
||||||
|
import org.dom4j.DocumentFactory;
|
||||||
|
import org.dom4j.DocumentHelper;
|
||||||
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
||||||
|
|
||||||
|
protected final Map<String, String> code2name = new HashMap<>();
|
||||||
|
|
||||||
|
protected final MdstoreClient mdstoreClient;
|
||||||
|
|
||||||
|
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||||
|
|
||||||
|
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
|
||||||
|
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
|
||||||
|
protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
|
||||||
|
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
|
||||||
|
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
|
||||||
|
|
||||||
|
/**
 * Creates the executor: opens the HDFS writer through the superclass, connects to the
 * mdstore, loads the vocabulary class names and installs the XPath namespace prefixes.
 *
 * @param hdfsPath     forwarded to the superclass constructor
 * @param hdfsNameNode forwarded to the superclass constructor
 * @param hdfsUser     forwarded to the superclass constructor
 * @param mongoBaseUrl passed to {@link MdstoreClient}
 * @param mongoDb      passed to {@link MdstoreClient}
 * @param dbUrl        passed to {@code DbClient} when loading the class names
 * @param dbUser       passed to {@code DbClient} when loading the class names
 * @param dbPassword   passed to {@code DbClient} when loading the class names
 * @throws Exception if the superclass constructor or any of the clients fail
 */
public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
        final String mongoDb, final String dbUrl, final String dbUser,
        final String dbPassword) throws Exception {

    super(hdfsPath, hdfsNameNode, hdfsUser);

    this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
    loadClassNames(dbUrl, dbUser, dbPassword);

    final Map<String, String> namespaces = new HashMap<>();

    // Subclass prefixes are registered first, so the common prefixes below
    // overwrite any subclass entry that uses the same key.
    // NOTE(review): this invokes an overridable method while the subclass is not yet initialised.
    registerNamespaces(namespaces);

    namespaces.put("dc", "http://purl.org/dc/elements/1.1/");
    namespaces.put("dr", "http://www.driver-repository.eu/namespace/dr");
    namespaces.put("dri", "http://www.driver-repository.eu/namespace/dri");
    namespaces.put("oaf", "http://namespace.openaire.eu/oaf");
    namespaces.put("oai", "http://www.openarchives.org/OAI/2.0/");
    namespaces.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");

    // The prefixes are set on the DocumentFactory returned by getInstance(), not on a private factory.
    final DocumentFactory factory = DocumentFactory.getInstance();
    factory.setXPathNamespaceURIs(namespaces);
}
|
||||||
|
|
||||||
|
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
||||||
|
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||||
|
code2name.clear();
|
||||||
|
dbClient.processResults("select code, name from class", rs -> {
|
||||||
|
try {
|
||||||
|
code2name.put(rs.getString("code"), rs.getString("name"));
|
||||||
|
} catch (final SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
|
||||||
|
|
||||||
|
for (final Entry<String, String> entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) {
|
||||||
|
// final String mdId = entry.getKey();
|
||||||
|
final String currentColl = entry.getValue();
|
||||||
|
|
||||||
|
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||||
|
final Document doc = DocumentHelper.parseText(xml);
|
||||||
|
|
||||||
|
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
||||||
|
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
|
||||||
|
final DataInfo info = prepareDataInfo(doc);
|
||||||
|
final long lastUpdateTimestamp = new Date().getTime();
|
||||||
|
|
||||||
|
for (final Oaf oaf : createOafs(doc, type, collectedFrom, info, lastUpdateTimestamp)) {
|
||||||
|
emitOaf(oaf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract void registerNamespaces(Map<String, String> nsContext);
|
||||||
|
|
||||||
|
protected List<Oaf> createOafs(final Document doc, final String type, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
||||||
|
|
||||||
|
final List<Oaf> oafs = new ArrayList<>();
|
||||||
|
|
||||||
|
switch (type.toLowerCase()) {
|
||||||
|
case "":
|
||||||
|
case "publication":
|
||||||
|
final Publication p = new Publication();
|
||||||
|
populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
|
||||||
|
p.setJournal(prepareJournal(doc, info));
|
||||||
|
oafs.add(p);
|
||||||
|
break;
|
||||||
|
case "dataset":
|
||||||
|
final Dataset d = new Dataset();
|
||||||
|
populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
|
||||||
|
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
||||||
|
d.setDevice(prepareDatasetDevice(doc, info));
|
||||||
|
d.setSize(prepareDatasetSize(doc, info));
|
||||||
|
d.setVersion(prepareDatasetVersion(doc, info));
|
||||||
|
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
|
||||||
|
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
|
||||||
|
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
|
||||||
|
oafs.add(d);
|
||||||
|
break;
|
||||||
|
case "software":
|
||||||
|
final Software s = new Software();
|
||||||
|
populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
|
||||||
|
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
||||||
|
s.setLicense(prepareSoftwareLicenses(doc, info));
|
||||||
|
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
||||||
|
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
||||||
|
oafs.add(s);
|
||||||
|
break;
|
||||||
|
case "otherresearchproducts":
|
||||||
|
default:
|
||||||
|
final OtherResearchProduct o = new OtherResearchProduct();
|
||||||
|
populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
|
||||||
|
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
||||||
|
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
||||||
|
o.setTool(prepareOtherResearchProductTools(doc, info));
|
||||||
|
oafs.add(o);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!oafs.isEmpty()) {
|
||||||
|
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
||||||
|
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
||||||
|
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
||||||
|
}
|
||||||
|
|
||||||
|
return oafs;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
||||||
|
r.setDataInfo(info);
|
||||||
|
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier")));
|
||||||
|
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
||||||
|
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||||
|
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
|
||||||
|
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
|
||||||
|
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
|
||||||
|
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
|
r.setOaiprovenance(prepareOAIprovenance(doc));
|
||||||
|
r.setAuthor(prepareAuthors(doc, info));
|
||||||
|
r.setLanguage(prepareLanguages(doc));
|
||||||
|
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
|
r.setSubject(prepareSubjects(doc, info));
|
||||||
|
r.setTitle(prepareTitles(doc, info));
|
||||||
|
r.setRelevantdate(prepareRelevantDates(doc, info));
|
||||||
|
r.setDescription(prepareDescriptions(doc, info));
|
||||||
|
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
|
||||||
|
r.setPublisher(preparePublisher(doc, info));
|
||||||
|
r.setEmbargoenddate(prepareEmbargoEndDate(doc, info));
|
||||||
|
r.setSource(prepareSources(doc, info));
|
||||||
|
r.setFulltext(null); // NOT PRESENT IN MDSTORES
|
||||||
|
r.setFormat(prepareFormats(doc, info));
|
||||||
|
r.setContributor(prepareContributors(doc, info));
|
||||||
|
r.setResourcetype(null); // TODO
|
||||||
|
r.setCoverage(prepareCoverages(doc, info));
|
||||||
|
r.setRefereed(null); // TODO
|
||||||
|
r.setContext(null); // TODO
|
||||||
|
r.setExternalReference(null); // TODO
|
||||||
|
r.setInstance(prepareInstances(doc, info));
|
||||||
|
r.setProcessingchargeamount(null); // TODO
|
||||||
|
r.setProcessingchargecurrency(null); // TODO
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareEmbargoEndDate(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Qualifier prepareLanguages(Document doc);
|
||||||
|
|
||||||
|
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||||
|
|
||||||
|
abstract protected void addRelations(final List<Oaf> oafs,
|
||||||
|
final Document doc,
|
||||||
|
final String type,
|
||||||
|
final KeyValue collectedFrom,
|
||||||
|
final DataInfo info,
|
||||||
|
final long lastUpdateTimestamp);
|
||||||
|
|
||||||
|
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||||
|
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||||
|
if (n != null) {
|
||||||
|
final String name = n.getText();
|
||||||
|
final String issnPrinted = n.valueOf("@issn");
|
||||||
|
final String issnOnline = n.valueOf("@eissn");
|
||||||
|
final String issnLinking = n.valueOf("@lissn");
|
||||||
|
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, null, null, null, null, null, null, null, info); }
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) {
|
||||||
|
final String classId = doc.valueOf(xpath);
|
||||||
|
final String className = code2name.get(classId);
|
||||||
|
return qualifier(classId, className, schemeId, schemeName);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<StructuredProperty> prepareListStructProps(final Document doc,
|
||||||
|
final String xpath,
|
||||||
|
final String xpathClassId,
|
||||||
|
final String schemeId,
|
||||||
|
final String schemeName,
|
||||||
|
final DataInfo info) {
|
||||||
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
|
for (final Object o : doc.selectNodes(xpath)) {
|
||||||
|
final Node n = (Node) o;
|
||||||
|
final String classId = n.valueOf(xpathClassId);
|
||||||
|
final String className = code2name.get(classId);
|
||||||
|
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
||||||
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
|
for (final Object o : doc.selectNodes(xpath)) {
|
||||||
|
final Node n = (Node) o;
|
||||||
|
res.add(structuredProperty(n.getText(), qualifier, info));
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final DataInfo info) {
|
||||||
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
|
for (final Object o : doc.selectNodes(xpath)) {
|
||||||
|
final Node n = (Node) o;
|
||||||
|
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
|
||||||
|
.valueOf("@schemename"), info));
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected OAIProvenance prepareOAIprovenance(final Document doc) {
|
||||||
|
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
|
||||||
|
|
||||||
|
final String identifier = n.valueOf("./*[local-name()='identifier']");
|
||||||
|
final String baseURL = n.valueOf("./*[local-name()='baseURL']");;
|
||||||
|
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");;
|
||||||
|
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
|
||||||
|
final String datestamp = n.valueOf("./*[local-name()='datestamp']");;
|
||||||
|
final String harvestDate = n.valueOf("@harvestDate");;
|
||||||
|
|
||||||
|
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected DataInfo prepareDataInfo(final Document doc) {
|
||||||
|
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
||||||
|
|
||||||
|
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
||||||
|
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
|
||||||
|
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
|
||||||
|
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
|
||||||
|
|
||||||
|
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
|
||||||
|
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
|
||||||
|
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
|
||||||
|
final String trust = n.valueOf("./oaf:trust");
|
||||||
|
|
||||||
|
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Field<String> prepareField(final Document doc, final String xpath, final DataInfo info) {
|
||||||
|
return field(doc.valueOf(xpath), info);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<Field<String>> prepareListFields(final Document doc, final String xpath, final DataInfo info) {
|
||||||
|
return listFields(info, (String[]) prepareListString(doc, xpath).toArray());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<String> prepareListString(final Document doc, final String xpath) {
|
||||||
|
final List<String> res = new ArrayList<>();
|
||||||
|
for (final Object o : doc.selectNodes(xpath)) {
|
||||||
|
final String s = ((Node) o).getText().trim();
|
||||||
|
if (StringUtils.isNotBlank(s)) {
|
||||||
|
res.add(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
super.close();
|
||||||
|
mdstoreClient.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -28,7 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable {
|
public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
|
||||||
|
|
||||||
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
|
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
|
||||||
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
|
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
|
||||||
|
|
|
@ -1,56 +1,10 @@
|
||||||
package eu.dnetlib.dhp.migration;
|
package eu.dnetlib.dhp.migration;
|
||||||
|
|
||||||
import java.io.Closeable;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Map.Entry;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
import org.dom4j.Document;
|
|
||||||
import org.dom4j.DocumentException;
|
|
||||||
import org.dom4j.DocumentFactory;
|
|
||||||
import org.dom4j.DocumentHelper;
|
|
||||||
import org.dom4j.Node;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication implements Closeable {
|
public class MigrateMongoMdstoresApplication {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
|
|
||||||
|
|
||||||
private final Map<String, String> code2name = new HashMap<>();
|
|
||||||
|
|
||||||
private final MdstoreClient mdstoreClient;
|
|
||||||
|
|
||||||
private static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
|
||||||
|
|
||||||
private static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
|
|
||||||
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
|
|
||||||
private static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
|
|
||||||
private static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
|
|
||||||
private static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
|
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -72,294 +26,17 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication
|
||||||
final String dbUser = parser.get("postgresUser");
|
final String dbUser = parser.get("postgresUser");
|
||||||
final String dbPassword = parser.get("postgresPassword");
|
final String dbPassword = parser.get("postgresPassword");
|
||||||
|
|
||||||
try (final MigrateMongoMdstoresApplication mig =
|
if (mdFormat.equalsIgnoreCase("oaf")) {
|
||||||
new MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
try (final OafMigrationExecutor mig =
|
||||||
|
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||||
}
|
}
|
||||||
|
} else if (mdFormat.equalsIgnoreCase("oaf")) {
|
||||||
|
|
||||||
}
|
} else {
|
||||||
|
throw new RuntimeException("Format not supported: " + mdFormat);
|
||||||
public MigrateMongoMdstoresApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
|
|
||||||
final String mongoDb, final String dbUrl, final String dbUser,
|
|
||||||
final String dbPassword) throws Exception {
|
|
||||||
super(hdfsPath, hdfsNameNode, hdfsUser);
|
|
||||||
|
|
||||||
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
|
||||||
loadClassNames(dbUrl, dbUser, dbPassword);
|
|
||||||
|
|
||||||
final Map<String, String> nsContext = new HashMap<>();
|
|
||||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
|
||||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
|
||||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
|
||||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
|
||||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
|
||||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
|
||||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
|
||||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
|
||||||
code2name.clear();
|
|
||||||
dbClient.processResults("select code, name from class", rs -> {
|
|
||||||
try {
|
|
||||||
code2name.put(rs.getString("code"), rs.getString("name"));
|
|
||||||
} catch (final SQLException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
|
|
||||||
|
|
||||||
for (final Entry<String, String> entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) {
|
|
||||||
// final String mdId = entry.getKey();
|
|
||||||
final String currentColl = entry.getValue();
|
|
||||||
|
|
||||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
|
||||||
for (final Oaf oaf : createOafs(xml)) {
|
|
||||||
emitOaf(oaf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<Oaf> createOafs(final String xml) throws DocumentException {
|
|
||||||
|
|
||||||
final Document doc = DocumentHelper.parseText(xml);
|
|
||||||
|
|
||||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
|
||||||
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
|
|
||||||
final DataInfo info = prepareDataInfo(doc);
|
|
||||||
final long lastUpdateTimestamp = new Date().getTime();
|
|
||||||
|
|
||||||
final List<Oaf> oafs = new ArrayList<>();
|
|
||||||
|
|
||||||
switch (type.toLowerCase()) {
|
|
||||||
case "":
|
|
||||||
case "publication":
|
|
||||||
final Publication p = new Publication();
|
|
||||||
populateResultFields(p, doc, collectedFrom, info, lastUpdateTimestamp);
|
|
||||||
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
|
|
||||||
p.setJournal(null); // TODO
|
|
||||||
oafs.add(p);
|
|
||||||
break;
|
|
||||||
case "dataset":
|
|
||||||
final Dataset d = new Dataset();
|
|
||||||
populateResultFields(d, doc, collectedFrom, info, lastUpdateTimestamp);
|
|
||||||
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
|
|
||||||
d.setStoragedate(null); // TODO
|
|
||||||
d.setDevice(null); // TODO
|
|
||||||
d.setSize(null); // TODO
|
|
||||||
d.setVersion(null); // TODO
|
|
||||||
d.setLastmetadataupdate(null); // TODO
|
|
||||||
d.setMetadataversionnumber(null); // TODO
|
|
||||||
d.setGeolocation(null); // TODO
|
|
||||||
oafs.add(d);
|
|
||||||
break;
|
|
||||||
case "otherresearchproducts":
|
|
||||||
|
|
||||||
case "software":
|
|
||||||
final Software s = new Software();
|
|
||||||
populateResultFields(s, doc, collectedFrom, info, lastUpdateTimestamp);
|
|
||||||
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
|
|
||||||
s.setDocumentationUrl(null); // TODO
|
|
||||||
s.setLicense(null); // TODO
|
|
||||||
s.setCodeRepositoryUrl(null); // TODO
|
|
||||||
s.setProgrammingLanguage(null); // TODO
|
|
||||||
oafs.add(s);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
final OtherResearchProduct o = new OtherResearchProduct();
|
|
||||||
populateResultFields(o, doc, collectedFrom, info, lastUpdateTimestamp);
|
|
||||||
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
|
|
||||||
o.setContactperson(null); // TODO
|
|
||||||
o.setContactgroup(null); // TODO
|
|
||||||
o.setTool(null); // TODO
|
|
||||||
oafs.add(o);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!oafs.isEmpty()) {
|
|
||||||
addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
|
||||||
addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
|
||||||
addRelations(oafs, doc, "//*", "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
|
||||||
}
|
|
||||||
|
|
||||||
return oafs;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addRelations(final List<Oaf> oafs,
|
|
||||||
final Document doc,
|
|
||||||
final String xpath,
|
|
||||||
final String type,
|
|
||||||
final KeyValue collectedFrom,
|
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp) {
|
|
||||||
for (final Object o : doc.selectNodes(xpath)) {
|
|
||||||
final Node n = (Node) o;
|
|
||||||
final Relation r = new Relation();
|
|
||||||
r.setRelType(null); // TODO
|
|
||||||
r.setSubRelType(null); // TODO
|
|
||||||
r.setRelClass(null); // TODO
|
|
||||||
r.setSource(null); // TODO
|
|
||||||
r.setTarget(null); // TODO
|
|
||||||
r.setCollectedFrom(Arrays.asList(collectedFrom));
|
|
||||||
r.setDataInfo(info);
|
|
||||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
|
||||||
oafs.add(r);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
|
||||||
|
|
||||||
r.setDataInfo(info);
|
|
||||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
|
||||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier")));
|
|
||||||
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
|
||||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
|
||||||
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
|
|
||||||
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
|
|
||||||
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
|
|
||||||
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
|
||||||
r.setOaiprovenance(prepareOAIprovenance(doc));
|
|
||||||
r.setAuthor(null); // TODO
|
|
||||||
r.setLanguage(prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"));
|
|
||||||
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
|
||||||
r.setSubject(prepareListStructProps(doc, "//dc:subject", info));
|
|
||||||
r.setTitle(prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info));
|
|
||||||
r.setRelevantdate(null); // TODO
|
|
||||||
r.setDescription(prepareListFields(doc, "//dc:description", info));
|
|
||||||
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
|
|
||||||
r.setPublisher(prepareField(doc, "//dc:publisher", info));
|
|
||||||
r.setEmbargoenddate(null); // TODO
|
|
||||||
r.setSource(null); // TODO
|
|
||||||
r.setFulltext(null); // TODO
|
|
||||||
r.setFormat(prepareListFields(doc, "//dc:format", info));
|
|
||||||
r.setContributor(prepareListFields(doc, "//dc:contributor", info));
|
|
||||||
r.setResourcetype(null); // TODO
|
|
||||||
r.setCoverage(prepareListFields(doc, "//dc:coverage", info));
|
|
||||||
r.setRefereed(null); // TODO
|
|
||||||
r.setContext(null); // TODO
|
|
||||||
r.setExternalReference(null); // TODO
|
|
||||||
r.setInstance(null); // TODO
|
|
||||||
r.setProcessingchargeamount(null); // TODO
|
|
||||||
r.setProcessingchargecurrency(null); // TODO
|
|
||||||
}
|
|
||||||
|
|
||||||
private Qualifier prepareQualifier(final Document doc, final String xpath, final String schemeId, final String schemeName) {
|
|
||||||
final String classId = doc.valueOf(xpath);
|
|
||||||
final String className = code2name.get(classId);
|
|
||||||
return qualifier(classId, className, schemeId, schemeName);
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<StructuredProperty> prepareListStructProps(final Document doc,
|
|
||||||
final String xpath,
|
|
||||||
final String xpathClassId,
|
|
||||||
final String schemeId,
|
|
||||||
final String schemeName,
|
|
||||||
final DataInfo info) {
|
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
|
||||||
for (final Object o : doc.selectNodes(xpath)) {
|
|
||||||
final Node n = (Node) o;
|
|
||||||
final String classId = n.valueOf(xpathClassId);
|
|
||||||
final String className = code2name.get(classId);
|
|
||||||
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
|
||||||
for (final Object o : doc.selectNodes(xpath)) {
|
|
||||||
final Node n = (Node) o;
|
|
||||||
res.add(structuredProperty(n.getText(), qualifier, info));
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<StructuredProperty> prepareListStructProps(final Document doc, final String xpath, final DataInfo info) {
|
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
|
||||||
for (final Object o : doc.selectNodes(xpath)) {
|
|
||||||
final Node n = (Node) o;
|
|
||||||
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
|
|
||||||
.valueOf("@schemename"), info));
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private OAIProvenance prepareOAIprovenance(final Document doc) {
|
|
||||||
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
|
|
||||||
|
|
||||||
final String identifier = n.valueOf("./*[local-name()='identifier']");
|
|
||||||
final String baseURL = n.valueOf("./*[local-name()='baseURL']");;
|
|
||||||
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");;
|
|
||||||
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
|
|
||||||
final String datestamp = n.valueOf("./*[local-name()='datestamp']");;
|
|
||||||
final String harvestDate = n.valueOf("@harvestDate");;
|
|
||||||
|
|
||||||
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
|
|
||||||
}
|
|
||||||
|
|
||||||
private DataInfo prepareDataInfo(final Document doc) {
|
|
||||||
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
|
||||||
|
|
||||||
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
|
||||||
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
|
|
||||||
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
|
|
||||||
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
|
|
||||||
|
|
||||||
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
|
|
||||||
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
|
|
||||||
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
|
|
||||||
final String trust = n.valueOf("./oaf:trust");
|
|
||||||
|
|
||||||
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Field<String> prepareField(final Document doc, final String xpath, final DataInfo info) {
|
|
||||||
return field(doc.valueOf(xpath), info);
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<Field<String>> prepareListFields(final Document doc, final String xpath, final DataInfo info) {
|
|
||||||
return listFields(info, (String[]) prepareListString(doc, xpath).toArray());
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> prepareListString(final Document doc, final String xpath) {
|
|
||||||
final List<String> res = new ArrayList<>();
|
|
||||||
for (final Object o : doc.selectNodes(xpath)) {
|
|
||||||
final String s = ((Node) o).getText().trim();
|
|
||||||
if (StringUtils.isNotBlank(s)) {
|
|
||||||
res.add(s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if
|
|
||||||
* (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value =
|
|
||||||
* parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2],
|
|
||||||
* arr[3], dataInfo); } } return null; }
|
|
||||||
*
|
|
||||||
* private List<StructuredProperty> prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final
|
|
||||||
* List<StructuredProperty> res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final
|
|
||||||
* StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } }
|
|
||||||
*
|
|
||||||
* return res; }
|
|
||||||
*
|
|
||||||
* private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final
|
|
||||||
* String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final
|
|
||||||
* String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
|
|
||||||
* if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null,
|
|
||||||
* null, info); } } } return null; }
|
|
||||||
*/
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
super.close();
|
|
||||||
mdstoreClient.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,154 +0,0 @@
|
||||||
package eu.dnetlib.dhp.migration;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
|
|
||||||
public class MigrationUtils {
|
|
||||||
|
|
||||||
public static KeyValue keyValue(final String k, final String v) {
|
|
||||||
final KeyValue kv = new KeyValue();
|
|
||||||
kv.setKey(k);
|
|
||||||
kv.setValue(v);
|
|
||||||
return kv;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<KeyValue> listKeyValues(final String... s) {
|
|
||||||
if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
|
|
||||||
|
|
||||||
final List<KeyValue> list = new ArrayList<>();
|
|
||||||
for (int i = 0; i < s.length; i += 2) {
|
|
||||||
list.add(keyValue(s[i], s[i + 1]));
|
|
||||||
}
|
|
||||||
return list;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T> Field<T> field(final T value, final DataInfo info) {
|
|
||||||
final Field<T> field = new Field<>();
|
|
||||||
field.setValue(value);
|
|
||||||
field.setDataInfo(info);
|
|
||||||
return field;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
|
|
||||||
return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
|
|
||||||
final Qualifier q = new Qualifier();
|
|
||||||
q.setClassid(classid);
|
|
||||||
q.setClassname(classname);
|
|
||||||
q.setSchemeid(schemeid);
|
|
||||||
q.setSchemename(schemename);
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static StructuredProperty structuredProperty(final String value,
|
|
||||||
final String classid,
|
|
||||||
final String classname,
|
|
||||||
final String schemeid,
|
|
||||||
final String schemename,
|
|
||||||
final DataInfo dataInfo) {
|
|
||||||
final StructuredProperty sp = new StructuredProperty();
|
|
||||||
sp.setValue(value);
|
|
||||||
sp.setQualifier(qualifier(classid, classname, schemeid, schemename));
|
|
||||||
sp.setDataInfo(dataInfo);
|
|
||||||
return sp;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
|
|
||||||
final ExtraInfo info = new ExtraInfo();
|
|
||||||
info.setName(name);
|
|
||||||
info.setValue(value);
|
|
||||||
info.setTypology(typology);
|
|
||||||
info.setProvenance(provenance);
|
|
||||||
info.setTrust(trust);
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static OAIProvenance oaiIProvenance(final String identifier,
|
|
||||||
final String baseURL,
|
|
||||||
final String metadataNamespace,
|
|
||||||
final Boolean altered,
|
|
||||||
final String datestamp,
|
|
||||||
final String harvestDate) {
|
|
||||||
|
|
||||||
final OriginDescription desc = new OriginDescription();
|
|
||||||
desc.setIdentifier(identifier);
|
|
||||||
desc.setBaseURL(baseURL);
|
|
||||||
desc.setMetadataNamespace(metadataNamespace);
|
|
||||||
desc.setAltered(altered);
|
|
||||||
desc.setDatestamp(datestamp);
|
|
||||||
desc.setHarvestDate(harvestDate);
|
|
||||||
|
|
||||||
final OAIProvenance p = new OAIProvenance();
|
|
||||||
p.setOriginDescription(desc);
|
|
||||||
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Journal journal(final String name,
|
|
||||||
final String issnPrinted,
|
|
||||||
final String issnOnline,
|
|
||||||
final String issnLinking,
|
|
||||||
final String ep,
|
|
||||||
final String iss,
|
|
||||||
final String sp,
|
|
||||||
final String vol,
|
|
||||||
final String edition,
|
|
||||||
final String conferenceplace,
|
|
||||||
final String conferencedate,
|
|
||||||
final DataInfo dataInfo) {
|
|
||||||
final Journal j = new Journal();
|
|
||||||
j.setName(name);
|
|
||||||
j.setIssnPrinted(issnPrinted);
|
|
||||||
j.setIssnOnline(issnOnline);
|
|
||||||
j.setIssnLinking(issnLinking);
|
|
||||||
j.setEp(ep);
|
|
||||||
j.setIss(iss);
|
|
||||||
j.setSp(sp);
|
|
||||||
j.setVol(vol);
|
|
||||||
j.setEdition(edition);
|
|
||||||
j.setConferenceplace(conferenceplace);
|
|
||||||
j.setConferencedate(conferencedate);
|
|
||||||
j.setDataInfo(dataInfo);
|
|
||||||
return j;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static DataInfo dataInfo(final Boolean deletedbyinference,
|
|
||||||
final String inferenceprovenance,
|
|
||||||
final Boolean inferred,
|
|
||||||
final Boolean invisible,
|
|
||||||
final Qualifier provenanceaction,
|
|
||||||
final String trust) {
|
|
||||||
final DataInfo d = new DataInfo();
|
|
||||||
d.setDeletedbyinference(deletedbyinference);
|
|
||||||
d.setInferenceprovenance(inferenceprovenance);
|
|
||||||
d.setInferred(inferred);
|
|
||||||
d.setInvisible(invisible);
|
|
||||||
d.setProvenanceaction(provenanceaction);
|
|
||||||
d.setTrust(trust);
|
|
||||||
return d;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String createOpenaireId(final String prefix, final String originalId) {
|
|
||||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
|
||||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
|
||||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,246 @@
|
||||||
|
package eu.dnetlib.dhp.migration;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.dom4j.Document;
|
||||||
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class OafMigrationExecutor extends AbstractMongoExecutor {
|
||||||
|
|
||||||
|
public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
|
||||||
|
final String dbUrl, final String dbUser,
|
||||||
|
final String dbPassword) throws Exception {
|
||||||
|
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||||
|
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||||
|
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||||
|
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||||
|
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||||
|
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||||
|
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void addRelations(final List<Oaf> oafs,
|
||||||
|
final Document doc,
|
||||||
|
final String type,
|
||||||
|
final KeyValue collectedFrom,
|
||||||
|
final DataInfo info,
|
||||||
|
final long lastUpdateTimestamp) {
|
||||||
|
for (final Object o : doc.selectNodes("//")) { // TODO
|
||||||
|
final Node n = (Node) o;
|
||||||
|
final Relation r = new Relation();
|
||||||
|
r.setRelType(null); // TODO
|
||||||
|
r.setSubRelType(null); // TODO
|
||||||
|
r.setRelClass(null); // TODO
|
||||||
|
r.setSource(null); // TODO
|
||||||
|
r.setTarget(null); // TODO
|
||||||
|
r.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||||
|
r.setDataInfo(info);
|
||||||
|
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
oafs.add(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||||
|
final List<Author> res = new ArrayList<>();
|
||||||
|
int pos = 1;
|
||||||
|
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||||
|
final Node n = (Node) o;
|
||||||
|
final Author author = new Author();
|
||||||
|
author.setFullname(n.getText());
|
||||||
|
author.setRank(pos++);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Qualifier prepareLanguages(final Document doc) {
|
||||||
|
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListStructProps(doc, "//dc:subject", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListFields(doc, "//dc:description", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||||
|
return prepareField(doc, "//dc:publisher", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListFields(doc, "//dc:format", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListFields(doc, "//dc:contributor", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListFields(doc, "//dc:coverage", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Instance> prepareInstances(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListFields(doc, "//dc:source", info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareEmbargoEndDate(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* private StructuredProperty prepareStructProp(final Document doc, final String xpath, final DataInfo dataInfo) { if
|
||||||
|
* (StringUtils.isBlank(s)) { return null; } final String[] parts = s.split("###"); if (parts.length == 2) { final String value =
|
||||||
|
* parts[0]; final String[] arr = parts[1].split("@@@"); if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2],
|
||||||
|
* arr[3], dataInfo); } } return null; }
|
||||||
|
*
|
||||||
|
* private List<StructuredProperty> prepareListOfStructProps(final Document doc, final String xpath, final DataInfo dataInfo) { final
|
||||||
|
* List<StructuredProperty> res = new ArrayList<>(); if (array != null) { for (final String s : (String[]) array.getArray()) { final
|
||||||
|
* StructuredProperty sp = prepareStructProp(s, dataInfo); if (sp != null) { res.add(sp); } } }
|
||||||
|
*
|
||||||
|
* return res; }
|
||||||
|
*
|
||||||
|
* private Journal prepareJournal(final Document doc, final String xpath, final DataInfo info) { if (StringUtils.isNotBlank(sj)) { final
|
||||||
|
* String[] arr = sj.split("@@@"); if (arr.length == 3) { final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; final
|
||||||
|
* String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
|
||||||
|
* if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null,
|
||||||
|
* null, info); } } } return null; }
|
||||||
|
*/
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue