1
0
Fork 0

sync with master branch

This commit is contained in:
Claudio Atzori 2020-02-17 13:49:46 +01:00
commit 1b18fd4d54
31 changed files with 3437 additions and 381 deletions

View File

@ -76,6 +76,41 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
<plugin>
<groupId>org.eclipse.m2e</groupId>
<artifactId>lifecycle-mapping</artifactId>
<version>1.0.0</version>
<configuration>
<lifecycleMappingMetadata>
<pluginExecutions>
<pluginExecution>
<pluginExecutionFilter>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-plugin-plugin
</artifactId>
<versionRange>
[3.2,)
</versionRange>
<goals>
<goal>descriptor</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore></ignore>
</action>
</pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -26,6 +26,12 @@
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@ -45,6 +51,17 @@
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.2.10</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>

View File

@ -0,0 +1,236 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.map.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AbstractMigrationExecutor implements Closeable {
private final AtomicInteger counter = new AtomicInteger(0);
private final Text key = new Text();
private final Text value = new Text();
private final ObjectMapper objectMapper = new ObjectMapper();
private final SequenceFile.Writer writer;
private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class);
public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser));
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
}
private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException {
final Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("HADOOP_USER_NAME", hdfsUser);
System.setProperty("hadoop.home.dir", "/");
FileSystem.get(URI.create(hdfsNameNode), conf);
return conf;
}
protected void emitOaf(final Oaf oaf) {
try {
key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase());
value.set(objectMapper.writeValueAsString(oaf));
writer.append(key, value);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws IOException {
writer.hflush();
writer.close();
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) { return null; }
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
}
public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static StructuredProperty structuredProperty(final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) {
if (value == null) { return null; }
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
public static DataInfo dataInfo(final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(final int prefix, final String originalId) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
}

View File

@ -0,0 +1,439 @@
package eu.dnetlib.dhp.migration;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
protected final Map<String, String> code2name = new HashMap<>();
protected final MdstoreClient mdstoreClient;
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class);
public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
final String mongoDb, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser);
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
loadClassNames(dbUrl, dbUser, dbPassword);
final Map<String, String> nsContext = new HashMap<>();
registerNamespaces(nsContext);
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
}
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
log.info("Loading vocabulary terms from db...");
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
code2name.clear();
dbClient.processResults("select code, name from class", rs -> {
try {
code2name.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) {
e.printStackTrace();
}
});
}
log.info("Found " + code2name.size() + " terms.");
}
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation));
final Map<String, String> colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation);
log.info("Found " + colls.size() + " mdstores");
for (final Entry<String, String> entry : colls.entrySet()) {
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
final String currentColl = entry.getValue();
for (final String xml : mdstoreClient.listRecords(currentColl)) {
final Document doc = DocumentHelper.parseText(xml);
final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
final DataInfo info = prepareDataInfo(doc);
final long lastUpdateTimestamp = new Date().getTime();
for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) {
emitOaf(oaf);
}
}
}
log.info("All Done.");
}
protected void registerNamespaces(final Map<String, String> nsContext) {
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
}
protected List<Oaf> createOafs(final Document doc,
final String type,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> oafs = new ArrayList<>();
switch (type.toLowerCase()) {
case "":
case "publication":
final Publication p = new Publication();
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
p.setJournal(prepareJournal(doc, info));
oafs.add(p);
break;
case "dataset":
final Dataset d = new Dataset();
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
d.setStoragedate(prepareDatasetStorageDate(doc, info));
d.setDevice(prepareDatasetDevice(doc, info));
d.setSize(prepareDatasetSize(doc, info));
d.setVersion(prepareDatasetVersion(doc, info));
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
oafs.add(d);
break;
case "software":
final Software s = new Software();
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
s.setLicense(prepareSoftwareLicenses(doc, info));
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
oafs.add(s);
break;
case "otherresearchproducts":
default:
final OtherResearchProduct o = new OtherResearchProduct();
populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
o.setTool(prepareOtherResearchProductTools(doc, info));
oafs.add(o);
break;
}
if (!oafs.isEmpty()) {
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
}
return oafs;
}
private List<Oaf> addProjectRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> res = new ArrayList<>();
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
for (final Object o : doc.selectNodes("//oaf:projectid")) {
final String projectId = createOpenaireId(40, ((Node) o).getText());
final Relation r1 = new Relation();
r1.setRelType("resultProject");
r1.setSubRelType("outcome");
r1.setRelClass("isProducedBy");
r1.setSource(docId);
r1.setTarget(projectId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
final Relation r2 = new Relation();
r2.setRelType("resultProject");
r2.setSubRelType("outcome");
r2.setRelClass("produces");
r2.setSource(projectId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
}
return res;
}
protected abstract List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp);
private void populateResultFields(final Result r,
final Document doc,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier")));
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setOaiprovenance(prepareOAIprovenance(doc));
r.setAuthor(prepareAuthors(doc, info));
r.setLanguage(prepareLanguages(doc));
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setSubject(prepareSubjects(doc, info));
r.setTitle(prepareTitles(doc, info));
r.setRelevantdate(prepareRelevantDates(doc, info));
r.setDescription(prepareDescriptions(doc, info));
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info));
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info));
r.setResourcetype(prepareResourceType(doc, info));
r.setCoverage(prepareCoverages(doc, info));
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
}
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
protected abstract Qualifier prepareLanguages(Document doc);
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) {
final String name = n.getText();
final String issnPrinted = n.valueOf("@issn");
final String issnOnline = n.valueOf("@eissn");
final String issnLinking = n.valueOf("@lissn");
final String ep = n.valueOf("@ep");
final String iss = n.valueOf("@iss");
final String sp = n.valueOf("@sp");
final String vol = n.valueOf("@vol");
final String edition = n.valueOf("@edition");
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); }
}
return null;
}
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
final String classId = node.valueOf(xpath);
final String className = code2name.get(classId);
return qualifier(classId, className, schemeId, schemeName);
}
protected List<StructuredProperty> prepareListStructProps(final Node node,
final String xpath,
final String xpathClassId,
final String schemeId,
final String schemeName,
final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId);
final String className = code2name.get(classId);
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
}
return res;
}
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res.add(structuredProperty(n.getText(), qualifier, info));
}
return res;
}
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
.valueOf("@schemename"), info));
}
return res;
}
protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
if (n == null) { return null; }
final String identifier = n.valueOf("./*[local-name()='identifier']");
final String baseURL = n.valueOf("./*[local-name()='baseURL']");;
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");;
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
final String datestamp = n.valueOf("./*[local-name()='datestamp']");;
final String harvestDate = n.valueOf("@harvestDate");;
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
}
protected DataInfo prepareDataInfo(final Document doc) {
final Node n = doc.selectSingleNode("//oaf:datainfo");
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
final String trust = n.valueOf("./oaf:trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
}
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
return field(node.valueOf(xpath), info);
}
protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
return listFields(info, prepareListString(node, xpath));
}
protected List<String> prepareListString(final Node node, final String xpath) {
final List<String> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final String s = ((Node) o).getText().trim();
if (StringUtils.isNotBlank(s)) {
res.add(s);
}
}
return res;
}
@Override
public void close() throws IOException {
super.close();
mdstoreClient.close();
}
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.function.Consumer;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private Connection connection;
public DbClient(final String address, final String login, final String password) {
try {
Class.forName("org.postgresql.Driver");
this.connection =
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
this.connection.setAutoCommit(false);
} catch (final Exception e) {
log.error(e.getClass().getName() + ": " + e.getMessage());
throw new RuntimeException(e);
}
log.info("Opened database successfully");
}
public void processResults(final String sql, final Consumer<ResultSet> consumer) {
try (final Statement stmt = connection.createStatement()) {
stmt.setFetchSize(100);
try (final ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
consumer.accept(rs);
}
} catch (final SQLException e) {
throw new RuntimeException(e);
}
} catch (final SQLException e1) {
throw new RuntimeException(e1);
}
}
@Override
public void close() throws IOException {
try {
connection.close();
} catch (final SQLException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.migration;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
public class ExtractEntitiesFromHDFSJob {
private static List<String> folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities");
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final String sourcePath = parser.get("sourcePath");
final String targetPath = parser.get("graphRawPath");
final String entity = parser.get("entity");
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> inputRdd = sc.emptyRDD();
folderNames.forEach(p -> inputRdd.union(
sc.sequenceFile(sourcePath+"/"+p, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.filter(k -> isEntityType(k._1(), entity))
.map(Tuple2::_2))
);
inputRdd.saveAsTextFile(targetPath+"/"+entity);
}
private static boolean isEntityType(final String item, final String entity) {
return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
}
}

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document;
import com.google.common.collect.Iterables;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
public class MdstoreClient implements Closeable {
private final MongoClient client;
private final MongoDatabase db;
private static final String COLL_METADATA = "metadata";
private static final String COLL_METADATA_MANAGER = "metadataManager";
private static final Log log = LogFactory.getLog(MdstoreClient.class);
public MdstoreClient(final String baseUrl, final String dbName) {
this.client = new MongoClient(new MongoClientURI(baseUrl));
this.db = getDb(client, dbName);
}
public Map<String, String> validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) {
final Map<String, String> transactions = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
final String mdId = entry.getString("mdId");
final String currentId = entry.getString("currentId");
if (StringUtils.isNoneBlank(mdId, currentId)) {
transactions.put(mdId, currentId);
}
}
final Map<String, String> res = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout)
&& entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) {
res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
}
}
return res;
}
private MongoDatabase getDb(final MongoClient client, final String dbName) {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err);
throw new RuntimeException(err);
}
return client.getDatabase(dbName);
}
private MongoCollection<Document> getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) {
if (!Iterables.contains(db.listCollectionNames(), collName)) {
final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
log.warn(err);
if (abortIfMissing) {
throw new RuntimeException(err);
} else {
return null;
}
}
return db.getCollection(collName);
}
public Iterable<String> listRecords(final String collName) {
final MongoCollection<Document> coll = getColl(db, collName, false);
return coll == null ? new ArrayList<>()
: () -> StreamSupport.stream(coll.find().spliterator(), false)
.filter(e -> e.containsKey("body"))
.map(e -> e.getString("body"))
.iterator();
}
@Override
public void close() throws IOException {
client.close();
}
}

View File

@ -0,0 +1,520 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.function.Consumer;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
private final DbClient dbClient;
private final long lastUpdateTimestamp;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
parser.parseArgument(args);
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("namenode");
final String hdfsUser = parser.get("hdfsUser");
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) {
log.info("Processing datasources...");
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
log.info("Processing projects...");
smdbe.execute("queryProjects.sql", smdbe::processProject);
log.info("Processing orgs...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
log.info("Processing relations ds <-> orgs ...");
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
log.info("Processing projects <-> orgs ...");
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
log.info("All done.");
}
}
public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime();
}
public void execute(final String sqlFile, final Consumer<ResultSet> consumer) throws Exception {
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
dbClient.processResults(sql, consumer);
}
public void processDatasource(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid")));
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
ds.setPid(new ArrayList<>());
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
ds.setDateoftransformation(null); // Value not returned by the SQL query
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
ds.setOaiprovenance(null); // Values not present in the DB
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
ds.setOfficialname(field(rs.getString("officialname"), info));
ds.setEnglishname(field(rs.getString("englishname"), info));
ds.setWebsiteurl(field(rs.getString("websiteurl"), info));
ds.setLogourl(field(rs.getString("logourl"), info));
ds.setContactemail(field(rs.getString("contactemail"), info));
ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info));
ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info));
ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info));
ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info));
ds.setDescription(field(rs.getString("description"), info));
ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info));
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
ds.setVersioning(field(rs.getBoolean("versioning"), info));
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
ds.setPidsystems(field(rs.getString("pidsystems"), info));
ds.setCertificates(field(rs.getString("certificates"), info));
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("datasourceid");
// rs.getArray("identities");
// rs.getString("officialname");
// rs.getString("englishname");
// rs.getString("contactemail");
// rs.getString("openairecompatibility"); // COMPLEX ...@@@...
// rs.getString("websiteurl");
// rs.getString("logourl");
// rs.getArray("accessinfopackage");
// rs.getDouble("latitude");
// rs.getDouble("longitude");
// rs.getString("namespaceprefix");
// rs.getInt("odnumberofitems"); // NULL
// rs.getDate("odnumberofitemsdate"); // NULL
// rs.getArray("subjects");
// rs.getString("description");
// rs.getString("odpolicies"); // NULL
// rs.getArray("odlanguages");
// rs.getArray("odcontenttypes");
// rs.getBoolean("inferred"); // false
// rs.getBoolean("deletedbyinference");// false
// rs.getDouble("trust"); // 0.9
// rs.getString("inferenceprovenance"); // NULL
// rs.getDate("dateofcollection");
// rs.getDate("dateofvalidation");
// rs.getDate("releasestartdate");
// rs.getDate("releaseenddate");
// rs.getString("missionstatementurl");
// rs.getBoolean("dataprovider");
// rs.getBoolean("serviceprovider");
// rs.getString("databaseaccesstype");
// rs.getString("datauploadtype");
// rs.getString("databaseaccessrestriction");
// rs.getString("datauploadrestriction");
// rs.getBoolean("versioning");
// rs.getString("citationguidelineurl");
// rs.getString("qualitymanagementkind");
// rs.getString("pidsystems");
// rs.getString("certificates");
// rs.getArray("policies");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("datasourcetype"); // COMPLEX
// rs.getString("provenanceaction"); //
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
// AS provenanceaction,
// rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
emitOaf(ds);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processProject(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Project p = new Project();
p.setId(createOpenaireId(40, rs.getString("projectid")));
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
p.setPid(new ArrayList<>());
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
p.setOaiprovenance(null); // Values not present in the DB
p.setWebsiteurl(field(rs.getString("websiteurl"), info));
p.setCode(field(rs.getString("code"), info));
p.setAcronym(field(rs.getString("acronym"), info));
p.setTitle(field(rs.getString("title"), info));
p.setStartdate(field(asString(rs.getDate("startdate")), info));
p.setEnddate(field(asString(rs.getDate("enddate")), info));
p.setCallidentifier(field(rs.getString("callidentifier"), info));
p.setKeywords(field(rs.getString("keywords"), info));
p.setDuration(field(Integer.toString(rs.getInt("duration")), info));
p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info));
p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info));
p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info));
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info));
p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype")));
p.setOptional1(field(rs.getString("optional1"), info));
p.setOptional2(field(rs.getString("optional2"), info));
p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info));
p.setContactfullname(field(rs.getString("contactfullname"), info));
p.setContactfax(field(rs.getString("contactfax"), info));
p.setContactphone(field(rs.getString("contactphone"), info));
p.setContactemail(field(rs.getString("contactemail"), info));
p.setSummary(field(rs.getString("summary"), info));
p.setCurrency(field(rs.getString("currency"), info));
p.setTotalcost(new Float(rs.getDouble("totalcost")));
p.setFundedamount(new Float(rs.getDouble("fundedamount")));
p.setDataInfo(info);
p.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("projectid");
// rs.getString("code");
// rs.getString("websiteurl");
// rs.getString("acronym");
// rs.getString("title");
// rs.getDate("startdate");
// rs.getDate("enddate");
// rs.getString("callidentifier");
// rs.getString("keywords");
// rs.getInt("duration");
// rs.getBoolean("ecsc39");
// rs.getBoolean("oamandatepublications");
// rs.getBoolean("ecarticle29_3");
// rs.getDate("dateofcollection");
// rs.getDate("dateoftransformation");
// rs.getBoolean("inferred");
// rs.getBoolean("deletedbyinference");
// rs.getDouble("trust");
// rs.getString("inferenceprovenance");
// rs.getString("optional1");
// rs.getString("optional2");
// rs.getString("jsonextrainfo");
// rs.getString("contactfullname");
// rs.getString("contactfax");
// rs.getString("contactphone");
// rs.getString("contactemail");
// rs.getString("summary");
// rs.getString("currency");
// rs.getDouble("totalcost");
// rs.getDouble("fundedamount");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("contracttype"); // COMPLEX
// rs.getString("provenanceaction"); // COMPLEX
// rs.getArray("pid");
// rs.getArray("subjects");
// rs.getArray("fundingtree");
emitOaf(p);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Organization o = new Organization();
o.setId(createOpenaireId(20, rs.getString("organizationid")));
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
o.setPid(new ArrayList<>());
o.setDateofcollection(asString(rs.getDate("dateofcollection")));
o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
o.setOaiprovenance(null); // Values not present in the DB
o.setLegalshortname(field("legalshortname", info));
o.setLegalname(field("legalname", info));
o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
o.setWebsiteurl(field("websiteurl", info));
o.setLogourl(field("logourl", info));
o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info));
o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info));
o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info));
o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info));
o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info));
o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
o.setCountry(prepareQualifierSplitting(rs.getString("country")));
o.setDataInfo(info);
o.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("organizationid");
// rs.getString("legalshortname");
// rs.getString("legalname");
// rs.getString("websiteurl");
// rs.getString("logourl");
// rs.getBoolean("eclegalbody");
// rs.getBoolean("eclegalperson");
// rs.getBoolean("ecnonprofit");
// rs.getBoolean("ecresearchorganization");
// rs.getBoolean("echighereducation");
// rs.getBoolean("ecinternationalorganizationeurinterests");
// rs.getBoolean("ecinternationalorganization");
// rs.getBoolean("ecenterprise");
// rs.getBoolean("ecsmevalidated");
// rs.getBoolean("ecnutscode");
// rs.getDate("dateofcollection");
// rs.getDate("dateoftransformation");
// rs.getBoolean("inferred");
// rs.getBoolean("deletedbyinference");
// rs.getDouble("trust");
// rs.getString("inferenceprovenance");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("country");
// rs.getString("provenanceaction");
// rs.getArray("pid");
emitOaf(o);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processDatasourceOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("organization"));
final String dsId = createOpenaireId(10, rs.getString("datasource"));
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("datasourceOrganization");
r1.setSubRelType("provision");
r1.setRelClass("isProvidedBy");
r1.setSource(dsId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
final Relation r2 = new Relation();
r2.setRelType("datasourceOrganization");
r2.setSubRelType("provision");
r2.setRelClass("provides");
r2.setSource(orgId);
r2.setTarget(dsId);
r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
// rs.getString("datasource");
// rs.getString("organization");
// rs.getDate("startdate"); // NULL
// rs.getDate("enddate"); // NULL
// rs.getBoolean("inferred"); // false
// rs.getBoolean("deletedbyinference"); // false
// rs.getDouble("trust"); // 0.9
// rs.getString("inferenceprovenance"); // NULL
// rs.getString("semantics"); // 'providedBy@@@provided
// by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS
// semantics,
// rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction ||
// '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processProjectOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("resporganization"));
final String projectId = createOpenaireId(40, rs.getString("project"));
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("projectOrganization");
r1.setSubRelType("participation");
r1.setRelClass("isParticipant");
r1.setSource(projectId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
final Relation r2 = new Relation();
r2.setRelType("projectOrganization");
r2.setSubRelType("participation");
r2.setRelClass("hasParticipant");
r2.setSource(orgId);
r2.setTarget(projectId);
r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
// rs.getString("project");
// rs.getString("resporganization");
// rs.getInt("participantnumber");
// rs.getDouble("contribution");
// rs.getDate("startdate");// null
// rs.getDate("enddate");// null
// rs.getBoolean("inferred");// false
// rs.getBoolean("deletedbyinference"); // false
// rs.getDouble("trust");
// rs.getString("inferenceprovenance"); // NULL
// rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass ||
// '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
// rs.getString("provenanceaction"); //
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
// AS provenanceaction
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
final String inferenceprovenance = rs.getString("inferenceprovenance");
final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
}
private Qualifier prepareQualifierSplitting(final String s) {
if (StringUtils.isBlank(s)) { return null; }
final String[] arr = s.split("@@@");
return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
}
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
try {
return listFields(info, (String[]) array.getArray());
} catch (final SQLException e) {
throw new RuntimeException("Invalid SQL array", e);
}
}
private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
if (StringUtils.isBlank(s)) { return null; }
final String[] parts = s.split("###");
if (parts.length == 2) {
final String value = parts[0];
final String[] arr = parts[1].split("@@@");
if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
}
return null;
}
private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
final List<StructuredProperty> res = new ArrayList<>();
if (array != null) {
for (final String s : (String[]) array.getArray()) {
final StructuredProperty sp = prepareStructProp(s, dataInfo);
if (sp != null) {
res.add(sp);
}
}
}
return res;
}
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
if (StringUtils.isNotBlank(sj)) {
final String[] arr = sj.split("@@@");
if (arr.length == 3) {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;;
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
if (issn != null || eissn != null
|| lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); }
}
}
return null;
}
@Override
public void close() throws IOException {
super.close();
dbClient.close();
}
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.migration;
import org.apache.commons.io.IOUtils;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MigrateMongoMdstoresApplication {
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
parser.parseArgument(args);
final String mongoBaseUrl = parser.get("mongoBaseUrl");
final String mongoDb = parser.get("mongoDb");
final String mdFormat = parser.get("mdFormat");
final String mdLayout = parser.get("mdLayout");
final String mdInterpretation = parser.get("mdInterpretation");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("namenode");
final String hdfsUser = parser.get("hdfsUser");
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
if (mdFormat.equalsIgnoreCase("oaf")) {
try (final OafMigrationExecutor mig =
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
}
} else if (mdFormat.equalsIgnoreCase("odf")) {
try (final OdfMigrationExecutor mig =
new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
}
} else {
throw new RuntimeException("Format not supported: " + mdFormat);
}
}
}

View File

@ -0,0 +1,251 @@
package eu.dnetlib.dhp.migration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.migration.pace.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafMigrationExecutor extends AbstractMongoExecutor {
private static final Log log = LogFactory.getLog(OafMigrationExecutor.class);
public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
@Override
protected void registerNamespaces(final Map<String, String> nsContext) {
super.registerNamespaces(nsContext);
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
}
@Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>();
int pos = 1;
for (final Object o : doc.selectNodes("//dc:creator")) {
final Node n = (Node) o;
final Author author = new Author();
author.setFullname(n.getText());
author.setRank(pos++);
final PacePerson p = new PacePerson(n.getText(), false);
if (p.isAccurate()) {
author.setName(p.getNormalisedFirstName());
author.setSurname(p.getNormalisedSurname());
}
res.add(author);
}
return res;
}
@Override
protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
}
@Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
}
@Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description", info);
}
@Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:publisher", info);
}
@Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:format", info);
}
@Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor", info);
}
@Override
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:coverage", info);
}
@Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:identifier")) {
final String url = ((Node) o).getText().trim();
if (url.startsWith("http")) {
final Instance instance = new Instance();
instance.setUrl(Arrays.asList(url));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
res.add(instance);
}
}
return res;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:source", info);
}
@Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
// SOFTWARES
@Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
// DATASETS
@Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
// OTHER PRODUCTS
@Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText());
final Relation r1 = new Relation();
r1.setRelType("resultResult");
r1.setSubRelType("publicationDataset");
r1.setRelClass("isRelatedTo");
r1.setSource(docId);
r1.setTarget(otherId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
final Relation r2 = new Relation();
r2.setRelType("resultResult");
r2.setSubRelType("publicationDataset");
r2.setRelClass("isRelatedTo");
r2.setSource(otherId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
}
return res;
}
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
}

View File

@ -0,0 +1,273 @@
package eu.dnetlib.dhp.migration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OdfMigrationExecutor extends AbstractMongoExecutor {
private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class);
public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
@Override
protected void registerNamespaces(final Map<String, String> nsContext) {
super.registerNamespaces(nsContext);
nsContext.put("dc", "http://datacite.org/schema/kernel-3");
}
@Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
}
@Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>();
int pos = 1;
for (final Object o : doc.selectNodes("//dc:creator")) {
final Node n = (Node) o;
final Author author = new Author();
author.setFullname(n.valueOf("./dc:creatorName"));
author.setName(n.valueOf("./dc:givenName"));
author.setSurname(n.valueOf("./dc:familyName"));
author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info));
author.setPid(preparePids(doc, info));
author.setRank(pos++);
res.add(author);
}
return res;
}
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("./dc:nameIdentifier")) {
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
}
return res;
}
@Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) {
final Instance instance = new Instance();
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
res.add(instance);
}
return res;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:date")) {
final String dateType = ((Node) o).valueOf("@dateType");
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
}
}
return res;
}
@Override
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributorName", info);
}
@Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:format", info);
}
@Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:publisher", info);
}
@Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info);
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
}
@Override
protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
}
@Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info);
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info);
}
@Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages");
}
@Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
}
// DATASETS
@Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
final List<GeoLocation> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:geoLocation")) {
final GeoLocation loc = new GeoLocation();
loc.setBox(((Node) o).valueOf("./dc:geoLocationBox"));
loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace"));
loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint"));
res.add(loc);
}
return res;
}
@Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:date[@dateType='Updated']", info);
}
@Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:version", info);
}
@Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:size", info);
}
@Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:date[@dateType='Issued']", info);
}
@Override
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText());
final String type = ((Node) o).valueOf("@relationType");
if (type.equals("IsSupplementTo")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
} else if (type.equals("IsPartOf")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
} else {}
}
return res;
}
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp,
final String source,
final String target,
final String subRelType,
final String relClass) {
final Relation r = new Relation();
r.setRelType("resultResult");
r.setSubRelType(subRelType);
r.setRelClass(relClass);
r.setSource(source);
r.setTarget(target);
r.setCollectedFrom(Arrays.asList(collectedFrom));
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
return r;
}
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
}
}

View File

@ -0,0 +1,176 @@
package eu.dnetlib.dhp.migration.pace;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles = null;
public static final String capitalize(final String s) {
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
public static final String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
public PacePerson(String s, final boolean aggressive) {
original = s;
s = Normalizer.normalize(s, Normalizer.Form.NFD);
s = s.replaceAll("\\(.+\\)", "");
s = s.replaceAll("\\[.+\\]", "");
s = s.replaceAll("\\{.+\\}", "");
s = s.replaceAll("\\s+-\\s+", "-");
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
s = s.replaceAll("\\d", " ");
s = s.replaceAll("\\n", " ");
s = s.replaceAll("\\.", " ");
s = s.replaceAll("\\s+", " ");
if (aggressive) {
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);
} else if (arr.length > 1) {
surname = splitTerms(arr[0]);
name = splitTerms(arr[1]);
fullname.addAll(surname);
fullname.addAll(name);
}
} else {
fullname = splitTerms(s);
int lastInitialPosition = fullname.size();
boolean hasSurnameInUpperCase = false;
for (int i = 0; i < fullname.size(); i++) {
final String term = fullname.get(i);
if (term.length() == 1) {
lastInitialPosition = i;
} else if (term.equals(term.toUpperCase())) {
hasSurnameInUpperCase = true;
}
}
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
name = fullname.subList(0, lastInitialPosition + 1);
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
for (final String term : fullname) {
if (term.length() > 1 && term.equals(term.toUpperCase())) {
surname.add(term);
} else {
name.add(term);
}
}
}
}
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
list.add(part);
}
}
return list;
}
public List<String> getName() {
return name;
}
public String getNameString() {
return Joiner.on(" ").join(getName());
}
public List<String> getSurname() {
return surname;
}
public List<String> getFullname() {
return fullname;
}
public String getOriginal() {
return original;
}
public String hash() {
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
}
public String getNormalisedFirstName() {
return Joiner.on(" ").join(getCapitalFirstnames());
}
public String getNormalisedSurname() {
return Joiner.on(" ").join(getCapitalSurname());
}
public String getSurnameString() {
return Joiner.on(" ").join(getSurname());
}
public String getNormalisedFullname() {
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
}
public List<String> getCapitalFirstnames() {
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
}
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
}
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
}
public boolean isAccurate() {
return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
}
}

View File

@ -0,0 +1,26 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the HDFS source path which contains the sequential file",
"paramRequired": true
},
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "g",
"paramLongName": "graphRawPath",
"paramDescription": "the path of the graph Raw in hdfs",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "The entity to extract",
"paramRequired": true
}
]

View File

@ -0,0 +1,38 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "hdfsUser",
"paramDescription": "the user wich create the hdfs seq file",
"paramRequired": true
},
{
"paramName": "dburl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "dbuser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "dbpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,68 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "hdfsUser",
"paramDescription": "the user wich create the hdfs seq file",
"paramRequired": true
},
{
"paramName": "mongourl",
"paramLongName": "mongoBaseUrl",
"paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]",
"paramRequired": true
},
{
"paramName": "db",
"paramLongName": "mongoDb",
"paramDescription": "mongo database",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "mdFormat",
"paramDescription": "metadata format",
"paramRequired": true
},
{
"paramName": "l",
"paramLongName": "mdLayout",
"paramDescription": "metadata layout",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "mdInterpretation",
"paramDescription": "metadata interpretation",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,22 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hdfsUser</name>
<value>dnet</value>
</property>
</configuration>

View File

@ -0,0 +1,282 @@
<workflow-app name="import Entities from aggretor to HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>graphRawPath</name>
<description>the graph Raw base path</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>mongourl</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="ImportEntitiesFromPostgres"/>
<error to="Kill"/>
</action>
<action name="ImportEntitiesFromPostgres">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${workingPath}/db_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-dburl</arg><arg>${postgresURL}</arg>
<arg>-dbuser</arg><arg>${postgresUser}</arg>
<arg>-dbpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODFEntitiesFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportODFEntitiesFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/odf_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportOAFEntitiesFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportOAFEntitiesFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/oaf_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="ExtractPublication">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: publication</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/publication</arg>
<arg>-e</arg><arg>publication</arg>
</spark>
<ok to="ExtractDataset"/>
<error to="Kill"/>
</action>
<action name="ExtractDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: dataset</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/dataset</arg>
<arg>-e</arg><arg>dataset</arg>
</spark>
<ok to="ExtractSoftware"/>
<error to="Kill"/>
</action>
<action name="ExtractSoftware">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: software</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/software</arg>
<arg>-e</arg><arg>software</arg>
</spark>
<ok to="ExtractORP"/>
<error to="Kill"/>
</action>
<action name="ExtractORP">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: otherresearchproduct</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/otherresearchproduct</arg>
<arg>-e</arg><arg>otherresearchproduct</arg>
</spark>
<ok to="ExtractDatasource"/>
<error to="Kill"/>
</action>
<action name="ExtractDatasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: datasource</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/datasource</arg>
<arg>-e</arg><arg>datasource</arg>
</spark>
<ok to="ExtractOrganization"/>
<error to="Kill"/>
</action>
<action name="ExtractOrganization">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: organization</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/organization</arg>
<arg>-e</arg><arg>organization</arg>
</spark>
<ok to="ExtractProject"/>
<error to="Kill"/>
</action>
<action name="ExtractProject">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: project</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/project</arg>
<arg>-e</arg><arg>project</arg>
</spark>
<ok to="ExtractRelation"/>
<error to="Kill"/>
</action>
<action name="ExtractRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: relation</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/relation</arg>
<arg>-e</arg><arg>relation</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,7 @@
van
der
de
dell
sig
mr
mrs

View File

@ -0,0 +1,17 @@
SELECT
dor.datasource AS datasource,
dor.organization AS organization,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
FROM dsm_datasource_organization dor
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)

View File

@ -0,0 +1,147 @@
SELECT
d.id AS datasourceid,
d.id || array_agg(distinct di.pid) AS identities,
d.officialname AS officialname,
d.englishname AS englishname,
d.contactemail AS contactemail,
CASE
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
THEN
'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
THEN
'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
THEN
'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
THEN
'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
THEN
'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
THEN
'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
THEN
'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
THEN
'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
THEN
'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
ELSE
'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
END AS openairecompatibility,
d.websiteurl AS websiteurl,
d.logourl AS logourl,
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
d.latitude AS latitude,
d.longitude AS longitude,
d.namespaceprefix AS namespaceprefix,
NULL AS odnumberofitems,
NULL AS odnumberofitemsdate,
(SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
FROM UNNEST(
ARRAY(
SELECT trim(s)
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
d.description AS description,
NULL AS odpolicies,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
d.dateofcollection AS dateofcollection,
d.dateofvalidation AS dateofvalidation,
-- re3data fields
d.releasestartdate AS releasestartdate,
d.releaseenddate AS releaseenddate,
d.missionstatementurl AS missionstatementurl,
d.dataprovider AS dataprovider,
d.serviceprovider AS serviceprovider,
d.databaseaccesstype AS databaseaccesstype,
d.datauploadtype AS datauploadtype,
d.databaseaccessrestriction AS databaseaccessrestriction,
d.datauploadrestriction AS datauploadrestriction,
d.versioning AS versioning,
d.citationguidelineurl AS citationguidelineurl,
d.qualitymanagementkind AS qualitymanagementkind,
d.pidsystems AS pidsystems,
d.certificates AS certificates,
ARRAY[]::text[] AS policies,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
d.typology || '@@@' || CASE
WHEN (d.typology = 'crissystem') THEN 'CRIS System'
WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
WHEN (d.typology = 'infospace') THEN 'Information Space'
WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
WHEN (d.typology = 'entityregistry') THEN 'Registry'
WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
WHEN (d.typology = 'websource') THEN 'Web Source'
WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
WHEN (d.typology = 'orprepository') THEN 'Repository'
ELSE 'Other'
END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
FROM dsm_datasources d
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
GROUP BY
d.id,
d.officialname,
d.englishname,
d.websiteurl,
d.logourl,
d.contactemail,
d.namespaceprefix,
d.description,
d.latitude,
d.longitude,
d.dateofcollection,
d.dateofvalidation,
d.releasestartdate,
d.releaseenddate,
d.missionstatementurl,
d.dataprovider,
d.serviceprovider,
d.databaseaccesstype,
d.datauploadtype,
d.databaseaccessrestriction,
d.datauploadrestriction,
d.versioning,
d.citationguidelineurl,
d.qualitymanagementkind,
d.pidsystems,
d.certificates,
dc.id,
dc.officialname,
d.issn,
d.eissn,
d.lissn

View File

@ -0,0 +1,36 @@
SELECT
o.id AS organizationid,
o.legalshortname AS legalshortname,
o.legalname AS legalname,
o.websiteurl AS websiteurl,
o.logourl AS logourl,
o.ec_legalbody AS eclegalbody,
o.ec_legalperson AS eclegalperson,
o.ec_nonprofit AS ecnonprofit,
o.ec_researchorganization AS ecresearchorganization,
o.ec_highereducation AS echighereducation,
o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests,
o.ec_internationalorganization AS ecinternationalorganization,
o.ec_enterprise AS ecenterprise,
o.ec_smevalidated AS ecsmevalidated,
o.ec_nutscode AS ecnutscode,
o.dateofcollection AS dateofcollection,
o.lastupdate AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
o.trust AS trust,
'' AS inferenceprovenance,
d.id AS collectedfromid,
d.officialname AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
ARRAY[]::text[] AS pid
FROM dsm_organizations o
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)

View File

@ -0,0 +1,53 @@
SELECT
o.id AS organizationid,
coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
o.name AS legalname,
array_agg(DISTINCT n.name) AS "alternativeNames",
(array_agg(u.url))[1] AS websiteurl,
o.modification_date AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
0.95 AS trust,
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM organizations o
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
LEFT OUTER JOIN other_names n ON (n.id = o.id)
GROUP BY
o.id,
o.name,
o.modification_date,
o.country
UNION ALL
SELECT
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid,
n.name AS legalshortname,
n.name AS legalname,
ARRAY[]::text[] AS "alternativeNames",
(array_agg(u.url))[1] AS websiteurl,
o.modification_date AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
0.88 AS trust,
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
GROUP BY
o.id, o.modification_date, o.country, n.name

View File

@ -0,0 +1,19 @@
SELECT
po.project AS project,
po.resporganization AS resporganization,
po.participantnumber AS participantnumber,
po.contribution AS contribution,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
po.trust AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
FROM project_organization po
LEFT OUTER JOIN projects p ON (p.id = po.project)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)

View File

@ -0,0 +1,90 @@
SELECT
p.id AS projectid,
p.code AS code,
p.websiteurl AS websiteurl,
p.acronym AS acronym,
p.title AS title,
p.startdate AS startdate,
p.enddate AS enddate,
p.call_identifier AS callidentifier,
p.keywords AS keywords,
p.duration AS duration,
p.ec_sc39 AS ecsc39,
p.oa_mandate_for_publications AS oamandatepublications,
p.ec_article29_3 AS ecarticle29_3,
p.dateofcollection AS dateofcollection,
p.lastupdate AS dateoftransformation,
p.inferred AS inferred,
p.deletedbyinference AS deletedbyinference,
p.trust AS trust,
p.inferenceprovenance AS inferenceprovenance,
p.optional1 AS optional1,
p.optional2 AS optional2,
p.jsonextrainfo AS jsonextrainfo,
p.contactfullname AS contactfullname,
p.contactfax AS contactfax,
p.contactphone AS contactphone,
p.contactemail AS contactemail,
p.summary AS summary,
p.currency AS currency,
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
GROUP BY
p.id,
p.code,
p.websiteurl,
p.acronym,
p.title,
p.startdate,
p.enddate,
p.call_identifier,
p.keywords,
p.duration,
p.ec_sc39,
p.oa_mandate_for_publications,
p.ec_article29_3,
p.dateofcollection,
p.inferred,
p.deletedbyinference,
p.trust,
p.inferenceprovenance,
p.contactfullname,
p.contactfax,
p.contactphone,
p.contactemail,
p.summary,
p.currency,
p.totalcost,
p.fundedamount,
dc.id,
dc.officialname,
pac.code, pac.name, pas.code, pas.name,
ctc.code, ctc.name, cts.code, cts.name;

View File

@ -0,0 +1,17 @@
SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar'
UNION ALL
SELECT
o.id AS id1,
'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2
FROM acronyms a
LEFT OUTER JOIN organizations o ON (a.id = o.id)
UNION ALL
SELECT
o.id AS id1,
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)

View File

@ -0,0 +1,9 @@
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

View File

@ -9,6 +9,37 @@
<artifactId>dhp-dedup</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>

View File

@ -1,10 +1,18 @@
package eu.dnetlib.dhp.graph;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.*;
import java.util.Map;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
public class GraphMappingUtils {
public final static Map<String, Class> types = Maps.newHashMap();

View File

@ -4,12 +4,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class SparkGraphImporterJob {

View File

@ -10,9 +10,8 @@ This module is automatically executed when running:
on module having set:
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-wf</artifactId>
<version>1.0.0-SNAPSHOT</version>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
</parent>
in `pom.xml` file. `oozie-package` profile initializes oozie workflow packaging, `workflow.source.dir` property points to a workflow (notice: this is not a relative path but a classpath to directory usually holding `oozie_app` subdirectory).

36
pom.xml
View File

@ -1,4 +1,6 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib.dhp</groupId>
@ -296,6 +298,11 @@
<artifactId>secondstring</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>${mongodb.driver.version}</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
@ -419,31 +426,7 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
<extensions>
@ -491,6 +474,7 @@
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<scala.version>2.11.12</scala.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
</properties>
</project>