Fork 0

Merge branch 'dump' of https://code-repo.d4science.org/miriam.baglioni/dnet-hadoop into resolve_conflicts_pr40_dump

This commit is contained in:
Claudio Atzori 2020-08-14 15:32:29 +02:00
commit 5b994d7ccf
156 changed files with 13802 additions and 36 deletions

View File

@ -87,6 +87,11 @@

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.common.api;
import java.io.IOException;
import java.io.InputStream;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.internal.Util;
import okio.BufferedSink;
import okio.Okio;
import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private InputStream inputStream;
private MediaType mediaType;
private long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
return new InputStreamRequestBody(inputStream, mediaType, len);
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
this.inputStream = inputStream;
this.mediaType = mediaType;
this.lenght = len;
public MediaType contentType() {
return mediaType;
public long contentLength() {
return lenght;
public void writeTo(BufferedSink sink) throws IOException {
Source source = null;
try {
source = Okio.source(inputStream);
} finally {

View File

@ -0,0 +1,7 @@
package eu.dnetlib.dhp.common.api;
/**
 * Signals that a concept record id could not be found among the depositions returned by Zenodo.
 *
 * <p>Extends {@link Exception} rather than {@link Throwable} so that generic {@code catch (Exception e)} handlers
 * see it; code that catches or declares {@code MissingConceptDoiException} itself is unaffected.
 */
public class MissingConceptDoiException extends Exception {

    /**
     * @param message human readable description of the missing concept record id
     */
    public MissingConceptDoiException(String message) {
        super(message);
    }
}

View File

@ -0,0 +1,266 @@
package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
public class ZenodoAPIClient implements Serializable {
String urlString;
String bucket;
String deposition_id;
String access_token;
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
public String getUrlString() {
return urlString;
public void setUrlString(String urlString) {
this.urlString = urlString;
public String getBucket() {
return bucket;
public void setBucket(String bucket) {
this.bucket = bucket;
public void setDeposition_id(String deposition_id){this.deposition_id = deposition_id;}
public ZenodoAPIClient(String urlString, String access_token) throws IOException {
this.urlString = urlString;
this.access_token = access_token;
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
* @return response code
* @throws IOException
public int newDeposition() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient();
RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, json);
Request request = new Request.Builder()
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
json = response.body().string();
ZenodoModel newSubmission = new Gson().fromJson(json, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return response.code();
* Upload files in Zenodo.
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file
* @return the response code
public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient();
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader("Content-Type", "application/zip") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
* Associates metadata information to the current deposition
* @param metadata the metadata
* @return response code
* @throws IOException
public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient();
RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, metadata);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
* To publish the current deposition. It works for both new deposition or new version of an old deposition
* @return response code
* @throws IOException
public int publish() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient();
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(RequestBody.create(MEDIA_TYPE_JSON, json))
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
* To create a new version of an already published deposition.
* It sets the deposition_id and the bucket to be used for the new version.
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is
* the last part of the url for the DOI Zenodo suggests to use to cite all versions:
* DOI: 10.xxx/zenodo.656930 concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient();
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader("Authorization", "Bearer " + access_token)
.post(RequestBody.create(MEDIA_TYPE_JSON, json))
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return response.code();
private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class);
for(ZenodoModel zm : zenodoModelList){
if (zm.getConceptrecid().equals(concept_rec_id)){
deposition_id = zm.getId();
throw new MissingConceptDoiException("The concept record id specified was missing in the list of depositions");
private String getPrevDepositions() throws IOException {
OkHttpClient httpClient = new OkHttpClient();
Request request = new Request.Builder()
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.body().string();
private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient();
Request request = new Request.Builder()
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
return zenodoModel.getLinks().getBucket();

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.common.api.zenodo;
/**
 * Entry of the {@code communities} list of a Zenodo deposition metadata; it only carries the community identifier.
 *
 * <p>Serializable because it is referenced from the serializable {@code Metadata} bean: without it, Java
 * serialization of a populated {@code Metadata} would fail.
 */
public class Community implements java.io.Serializable {

    private String identifier;

    /** @return the community identifier, or {@code null} when not set */
    public String getIdentifier() {
        return identifier;
    }

    /** @param identifier the community identifier */
    public void setIdentifier(String identifier) {
        this.identifier = identifier;
    }
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.common.api.zenodo;
/**
 * Entry of the {@code creators} list of a Zenodo deposition metadata: name, affiliation and (optional) ORCID.
 *
 * <p>Serializable because it is referenced from the serializable {@code Metadata} bean.
 */
public class Creator implements java.io.Serializable {

    private String affiliation;
    private String name;
    private String orcid;

    public String getAffiliation() {
        return affiliation;
    }

    public void setAffiliation(String affiliation) {
        this.affiliation = affiliation;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getOrcid() {
        return orcid;
    }

    public void setOrcid(String orcid) {
        this.orcid = orcid;
    }

    /**
     * Builds a creator from the given values; {@code null} arguments leave the corresponding field unset.
     *
     * @param name the creator name
     * @param affiliation the creator affiliation
     * @param orcid the creator ORCID
     * @return the new creator
     */
    public static Creator newInstance(String name, String affiliation, String orcid) {
        Creator c = new Creator();
        // fields of a fresh instance are already null, so no null guard is needed before assigning
        c.name = name;
        c.affiliation = affiliation;
        c.orcid = orcid;
        return c;
    }
}

View File

@ -0,0 +1,58 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import net.minidev.json.annotate.JsonIgnore;
/**
 * Entry of the {@code files} list of a Zenodo deposition: checksum, file name, size and id.
 *
 * <p>The per-file {@code links} object is not mapped by this class.
 */
public class File implements Serializable {

    private String checksum;
    private String filename;
    private long filesize;
    private String id;

    public String getChecksum() {
        return checksum;
    }

    public void setChecksum(String checksum) {
        this.checksum = checksum;
    }

    public String getFilename() {
        return filename;
    }

    public void setFilename(String filename) {
        this.filename = filename;
    }

    public long getFilesize() {
        return filesize;
    }

    public void setFilesize(long filesize) {
        this.filesize = filesize;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
/** Entry of the {@code grants} list of a Zenodo deposition metadata, identified by its id. */
public class Grant implements Serializable {

    private String id;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    /**
     * @param id the grant identifier
     * @return a new grant carrying {@code id}
     */
    public static Grant newInstance(String id) {
        Grant g = new Grant();
        g.id = id;
        return g;
    }
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
/**
 * The {@code links} object of a Zenodo deposition. Accessor names keep the underscores of the fields so that they
 * stay aligned with the JSON property names.
 */
public class Links implements Serializable {

    private String bucket;
    private String discard;
    private String edit;
    private String files;
    private String html;
    private String latest_draft;
    private String latest_draft_html;
    private String publish;
    private String self;

    public String getBucket() {
        return bucket;
    }

    public void setBucket(String bucket) {
        this.bucket = bucket;
    }

    public String getDiscard() {
        return discard;
    }

    public void setDiscard(String discard) {
        this.discard = discard;
    }

    public String getEdit() {
        return edit;
    }

    public void setEdit(String edit) {
        this.edit = edit;
    }

    public String getFiles() {
        return files;
    }

    public void setFiles(String files) {
        this.files = files;
    }

    public String getHtml() {
        return html;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public String getLatest_draft() {
        return latest_draft;
    }

    public void setLatest_draft(String latest_draft) {
        this.latest_draft = latest_draft;
    }

    public String getLatest_draft_html() {
        return latest_draft_html;
    }

    public void setLatest_draft_html(String latest_draft_html) {
        this.latest_draft_html = latest_draft_html;
    }

    public String getPublish() {
        return publish;
    }

    public void setPublish(String publish) {
        this.publish = publish;
    }

    public String getSelf() {
        return self;
    }

    public void setSelf(String self) {
        this.self = self;
    }
}

View File

@ -0,0 +1,153 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class Metadata implements Serializable {
private String access_right;
private List<Community> communities;
private List<Creator> creators;
private String description;
private String doi;
private List<Grant> grants;
private List<String> keywords;
private String language;
private String license;
private PrereserveDoi prereserve_doi;
private String publication_date;
private List<String> references;
private List<RelatedIdentifier> related_identifiers;
private String title;
private String upload_type;
private String version;
public String getUpload_type() {
return upload_type;
public void setUpload_type(String upload_type) {
this.upload_type = upload_type;
public String getVersion() {
return version;
public void setVersion(String version) {
this.version = version;
public String getAccess_right() {
return access_right;
public void setAccess_right(String access_right) {
this.access_right = access_right;
public List<Community> getCommunities() {
return communities;
public void setCommunities(List<Community> communities) {
this.communities = communities;
public List<Creator> getCreators() {
return creators;
public void setCreators(List<Creator> creators) {
this.creators = creators;
public String getDescription() {
return description;
public void setDescription(String description) {
this.description = description;
public String getDoi() {
return doi;
public void setDoi(String doi) {
this.doi = doi;
public List<Grant> getGrants() {
return grants;
public void setGrants(List<Grant> grants) {
this.grants = grants;
public List<String> getKeywords() {
return keywords;
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
public String getLanguage() {
return language;
public void setLanguage(String language) {
this.language = language;
public String getLicense() {
return license;
public void setLicense(String license) {
this.license = license;
public PrereserveDoi getPrereserve_doi() {
return prereserve_doi;
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
this.prereserve_doi = prereserve_doi;
public String getPublication_date() {
return publication_date;
public void setPublication_date(String publication_date) {
this.publication_date = publication_date;
public List<String> getReferences() {
return references;
public void setReferences(List<String> references) {
this.references = references;
public List<RelatedIdentifier> getRelated_identifiers() {
return related_identifiers;
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
this.related_identifiers = related_identifiers;
public String getTitle() {
return title;
public void setTitle(String title) {
this.title = title;

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
/** The {@code prereserve_doi} object of a Zenodo deposition metadata: a DOI and the related record id. */
public class PrereserveDoi implements Serializable {

    private String doi;
    private String recid;

    public String getDoi() {
        return doi;
    }

    public void setDoi(String doi) {
        this.doi = doi;
    }

    public String getRecid() {
        return recid;
    }

    public void setRecid(String recid) {
        this.recid = recid;
    }
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
/**
 * Entry of the {@code related_identifiers} list of a Zenodo deposition metadata: identifier, relation, resource
 * type and scheme.
 */
public class RelatedIdentifier implements Serializable {

    private String identifier;
    private String relation;
    private String resource_type;
    private String scheme;

    public String getIdentifier() {
        return identifier;
    }

    public void setIdentifier(String identifier) {
        this.identifier = identifier;
    }

    public String getRelation() {
        return relation;
    }

    public void setRelation(String relation) {
        this.relation = relation;
    }

    public String getResource_type() {
        return resource_type;
    }

    public void setResource_type(String resource_type) {
        this.resource_type = resource_type;
    }

    public String getScheme() {
        return scheme;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }
}

View File

@ -0,0 +1,118 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import java.util.List;
public class ZenodoModel implements Serializable {
private String conceptrecid;
private String created;
private List<File> files;
private String id;
private Links links;
private Metadata metadata;
private String modified;
private String owner;
private String record_id;
private String state;
private boolean submitted;
private String title;
public String getConceptrecid() {
return conceptrecid;
public void setConceptrecid(String conceptrecid) {
this.conceptrecid = conceptrecid;
public String getCreated() {
return created;
public void setCreated(String created) {
this.created = created;
public List<File> getFiles() {
return files;
public void setFiles(List<File> files) {
this.files = files;
public String getId() {
return id;
public void setId(String id) {
this.id = id;
public Links getLinks() {
return links;
public void setLinks(Links links) {
this.links = links;
public Metadata getMetadata() {
return metadata;
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
public String getModified() {
return modified;
public void setModified(String modified) {
this.modified = modified;
public String getOwner() {
return owner;
public void setOwner(String owner) {
this.owner = owner;
public String getRecord_id() {
return record_id;
public void setRecord_id(String record_id) {
this.record_id = record_id;
public String getState() {
return state;
public void setState(String state) {
this.state = state;
public boolean isSubmitted() {
return submitted;
public void setSubmitted(boolean submitted) {
this.submitted = submitted;
public String getTitle() {
return title;
public void setTitle(String title) {
this.title = title;

View File

@ -0,0 +1,6 @@
package eu.dnetlib.dhp.common.api.zenodo;
import java.util.ArrayList;
public class ZenodoModelList extends ArrayList<ZenodoModel> {

View File

@ -0,0 +1,85 @@
package eu.dnetlib.dhp.common.api;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
// Tests driving ZenodoAPIClient through a full deposition workflow against the endpoint in URL_STRING.
// NOTE(review): this listing is incomplete - the `new ZenodoAPIClient(URL_STRING,` and `new File(getClass()`
// statements are cut mid-expression, and no closing braces or test annotations are visible although
// org.junit.jupiter.api.Test and Disabled are imported. Restore the missing parts from the repository.
public class ZenodoAPIClientTest {
// Depositions endpoint of the Zenodo sandbox.
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
// Empty in the source; presumably a valid token has to be supplied before the tests can pass - confirm.
private final String ACCESS_TOKEN = "";
// Concept record id passed to newVersion by the two new-version tests.
private final String CONCEPT_REC_ID = "657113";
// Creates a deposition (expects 201), uploads a file (200), sends metadata.json (200), publishes (202).
public void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
Assertions.assertEquals(201, client.newDeposition());
File file = new File(getClass()
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMretadata(metadata));
Assertions.assertEquals(202, client.publish());
// Creates a new version for CONCEPT_REC_ID (expects 201), uploads a file (200), publishes (202).
public void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(202, client.publish());
// Same visible steps and upload name as testNewVersionNewName.
// NOTE(review): the two tests presumably differ in the resource file whose path is cut off - confirm.
public void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
File file = new File(getClass()
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(202, client.publish());

View File

@ -0,0 +1 @@
{"metadata":{"access_right":"open","communities":[{"identifier":"openaire-research-graph"}],"creators":[{"affiliation":"ISTI - CNR","name":"Bardi, Alessia","orcid":"0000-0002-1112-1292"},{"affiliation":"eifl", "name":"Kuchma, Iryna"},{"affiliation":"BIH", "name":"Brobov, Evgeny"},{"affiliation":"GIDIF RBM", "name":"Truccolo, Ivana"},{"affiliation":"unesp", "name":"Monteiro, Elizabete"},{"affiliation":"und", "name":"Casalegno, Carlotta"},{"affiliation":"CARL ABRC", "name":"Clary, Erin"},{"affiliation":"The University of Edimburgh", "name":"Romanowski, Andrew"},{"affiliation":"ISTI - CNR", "name":"Pavone, Gina"},{"affiliation":"ISTI - CNR", "name":"Artini, Michele"},{"affiliation":"ISTI - CNR","name":"Atzori, Claudio","orcid":"0000-0001-9613-6639"},{"affiliation":"University of Bielefeld","name":"Bäcker, Amelie","orcid":"0000-0001-6015-2063"},{"affiliation":"ISTI - CNR","name":"Baglioni, Miriam","orcid":"0000-0002-2273-9004"},{"affiliation":"University of Bielefeld","name":"Czerniak, Andreas","orcid":"0000-0003-3883-4169"},{"affiliation":"ISTI - CNR","name":"De Bonis, Michele"},{"affiliation":"Athena Research and Innovation Centre","name":"Dimitropoulos, Harry"},{"affiliation":"Athena Research and Innovation Centre","name":"Foufoulas, Ioannis"},{"affiliation":"University of Warsaw","name":"Horst, Marek"},{"affiliation":"Athena Research and Innovation Centre","name":"Iatropoulou, Katerina"},{"affiliation":"University of Warsaw","name":"Jacewicz, Przemyslaw"},{"affiliation":"Athena Research and Innovation Centre","name":"Kokogiannaki, Argiro", "orcid":"0000-0002-3880-0244"},{"affiliation":"ISTI - CNR","name":"La Bruzzo, Sandro","orcid":"0000-0003-2855-1245"},{"affiliation":"ISTI - CNR","name":"Lazzeri, Emma"},{"affiliation":"University of Bielefeld","name":"Löhden, Aenne"},{"affiliation":"ISTI - CNR","name":"Manghi, Paolo","orcid":"0000-0001-7291-3210"},{"affiliation":"ISTI - CNR","name":"Mannocci, Andrea","orcid":"0000-0002-5193-7851"},{"affiliation":"Athena Research 
and Innovation Center","name":"Manola, Natalia"},{"affiliation":"ISTI - CNR","name":"Ottonello, Enrico"},{"affiliation":"University of Bielefeld","name":"Shirrwagen, Jochen"}],"description":"\\u003cp\\u003eThis dump provides access to the metadata records of publications, research data, software and projects that may be relevant to the Corona Virus Disease (COVID-19) fight. The dump contains records of the OpenAIRE COVID-19 Gateway (https://covid-19.openaire.eu/), identified via full-text mining and inference techniques applied to the OpenAIRE Research Graph (https://explore.openaire.eu/). The Graph is one of the largest Open Access collections of metadata records and links between publications, datasets, software, projects, funders, and organizations, aggregating 12,000+ scientific data sources world-wide, among which the Covid-19 data sources Zenodo COVID-19 Community, WHO (World Health Organization), BIP! FInder for COVID-19, Protein Data Bank, Dimensions, scienceOpen, and RSNA. \\u003cp\\u003eThe dump consists of a gzip file containing one json per line. Each json is compliant to the schema available at https://doi.org/10.5281/zenodo.3974226\\u003c/p\\u003e ","title":"OpenAIRE Covid-19 publications, datasets, software and projects metadata.","upload_type":"dataset","version":"1.0"}}

View File

@ -0,0 +1 @@
This is a test for a new deposition

View File

@ -0,0 +1 @@
This is a test for a new version of an old deposition

View File

@ -0,0 +1,2 @@
This is a test for a new version of an old deposition. This should replace the other new version. I expect to have only two
files in the deposition

View File

@ -79,6 +79,15 @@ public class ModelSupport {
entityIdPrefix.put("result", "50");
/**
 * Maps a two-digit identifier prefix to the name of the entity type it denotes.
 * NOTE(review): looks like the inverse of {@code entityIdPrefix} (only its "result" entry is visible in this
 * hunk) - keep the two maps aligned.
 */
public static final Map<String, String> idPrefixEntity = Maps.newHashMap();

static {
    idPrefixEntity.put("10", "datasource");
    idPrefixEntity.put("20", "organization");
    idPrefixEntity.put("40", "project");
    idPrefixEntity.put("50", "result");
}
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
static {

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
/**
 * Used to refer to the Article Processing Charge information. Not dumped in this release. It contains two
 * parameters:
 * <ul>
 * <li>currency of type String to store the currency of the APC</li>
 * <li>amount of type String to store the charged amount</li>
 * </ul>
 */
public class APC implements Serializable {

    private String currency;
    private String amount;

    public String getCurrency() {
        return currency;
    }

    public void setCurrency(String currency) {
        this.currency = currency;
    }

    public String getAmount() {
        return amount;
    }

    public void setAmount(String amount) {
        this.amount = amount;
    }
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.schema.dump.oaf;
* AccessRight. Used to represent the result access rights. It extends the eu.dnet.lib.dhp.schema.dump.oaf.Qualifier
* element with a parameter scheme of type String to store the scheme. Values for this element are found against the
* COAR access right scheme. The classid of the element accessright in eu.dnetlib.dhp.schema.oaf.Result is used to get
* the COAR corresponding code whose value will be used to set the code parameter. The COAR label corresponding to the
* COAR code will be used to set the label parameter. The scheme value will always be the one referring to the COAR
* access right scheme
public class AccessRight extends Qualifier {
private String scheme;
public String getScheme() {
return scheme;
public void setScheme(String scheme) {
this.scheme = scheme;
public static AccessRight newInstance(String code, String label, String scheme) {
AccessRight ar = new AccessRight();
return ar;

View File

@ -0,0 +1,73 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import java.util.List;
* Used to represent the generic author of the result. It has six parameters: - name of type String to store the given
* name of the author. The value for this parameter corresponds to eu.dnetlib.dhp.schema.oaf.Author name - surname of
* type String to store the family name of the author. The value for this parameter corresponds to
* eu.dnetlib.dhp.schema.oaf.Author surname - fullname of type String to store the fullname of the author. The value for
* this parameter corresponds to eu.dnetlib.dhp.schema.oaf.Author fullname - rank of type Integer to store the rank on
* the author in the result's authors list. The value for this parameter corresponds to eu.dnetlib.dhp.schema.oaf.Author
* rank - pid of type eu.dnetlib.dhp.schema.dump.oaf.Pid to store the persistent identifier for the author. For the
* moment only ORCID identifiers will be dumped. - The id element is instantiated by using the following values in the
* eu.dnetlib.dhp.schema.oaf.Result pid: * Qualifier.classid for scheme * value for value - The provenance element is
* instantiated only if the dataInfo is set for the pid in the result to be dumped. The provenance element is
* instantiated by using the following values in the eu.dnetlib.dhp.schema.oaf.Result pid: *
* dataInfo.provenanceaction.classname for provenance * dataInfo.trust for trust
public class Author implements Serializable {
private String fullname;
private String name;
private String surname;
private Integer rank;
private Pid pid;
public String getFullname() {
return fullname;
public void setFullname(String fullname) {
this.fullname = fullname;
public String getName() {
return name;
public void setName(String name) {
this.name = name;
public String getSurname() {
return surname;
public void setSurname(String surname) {
this.surname = surname;
public Integer getRank() {
return rank;
public void setRank(Integer rank) {
this.rank = rank;
public Pid getPid() {
return pid;
public void setPid(Pid pid) {
this.pid = pid;

View File

@ -0,0 +1,136 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import java.util.Objects;
/**
 * To store information about the conference or journal where the result has been presented or published. It
 * contains eleven parameters, each corresponding to the parameter with the same name of
 * eu.dnetlib.dhp.schema.oaf.Journal:
 * <ul>
 * <li>name of type String to store the name of the journal or conference</li>
 * <li>issnPrinted of type String to store the journal printed issn</li>
 * <li>issnOnline of type String to store the journal online issn</li>
 * <li>issnLinking of type String to store the journal linking issn</li>
 * <li>ep of type String to store the end page</li>
 * <li>iss of type String to store the journal issue</li>
 * <li>sp of type String to store the start page</li>
 * <li>vol of type String to store the volume</li>
 * <li>edition of type String to store the edition of the journal or conference proceeding</li>
 * <li>conferenceplace of type String to store the place of the conference</li>
 * <li>conferencedate of type String to store the date of the conference</li>
 * </ul>
 */
public class Container implements Serializable {

    private String name;
    private String issnPrinted;
    private String issnOnline;
    private String issnLinking;
    private String ep;
    private String iss;
    private String sp;
    private String vol;
    private String edition;
    private String conferenceplace;
    private String conferencedate;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getIssnPrinted() {
        return issnPrinted;
    }

    public void setIssnPrinted(String issnPrinted) {
        this.issnPrinted = issnPrinted;
    }

    public String getIssnOnline() {
        return issnOnline;
    }

    public void setIssnOnline(String issnOnline) {
        this.issnOnline = issnOnline;
    }

    public String getIssnLinking() {
        return issnLinking;
    }

    public void setIssnLinking(String issnLinking) {
        this.issnLinking = issnLinking;
    }

    public String getEp() {
        return ep;
    }

    public void setEp(String ep) {
        this.ep = ep;
    }

    public String getIss() {
        return iss;
    }

    public void setIss(String iss) {
        this.iss = iss;
    }

    public String getSp() {
        return sp;
    }

    public void setSp(String sp) {
        this.sp = sp;
    }

    public String getVol() {
        return vol;
    }

    public void setVol(String vol) {
        this.vol = vol;
    }

    public String getEdition() {
        return edition;
    }

    public void setEdition(String edition) {
        this.edition = edition;
    }

    public String getConferenceplace() {
        return conferenceplace;
    }

    public void setConferenceplace(String conferenceplace) {
        this.conferenceplace = conferenceplace;
    }

    public String getConferencedate() {
        return conferencedate;
    }

    public void setConferencedate(String conferencedate) {
        this.conferencedate = conferencedate;
    }
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
/**
 * To represent the information described by a scheme and a value in that scheme (i.e. pid). It has two parameters:
 * <ul>
 * <li>scheme of type String to store the scheme</li>
 * <li>value of type String to store the value in that scheme</li>
 * </ul>
 */
public class ControlledField implements Serializable {

    private String scheme;
    private String value;

    public String getScheme() {
        return scheme;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    /**
     * Builds a controlled field from a scheme and a value in that scheme.
     *
     * @param scheme the scheme
     * @param value the value in that scheme
     * @return the populated controlled field
     */
    public static ControlledField newInstance(String scheme, String value) {
        ControlledField cf = new ControlledField();
        // the arguments have to be stored: an empty instance would make the factory useless
        cf.setScheme(scheme);
        cf.setValue(value);
        return cf;
    }
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.schema.dump.oaf;
* Represents the country associated to this result. It extends eu.dnetlib.dhp.schema.dump.oaf.Qualifier with a
* provenance parameter of type eu.dnetlib.dhp.schema.dumo.oaf.Provenance. The country in not mapped if its value in the
* result reprensented in the internal format is Unknown. The value for this element correspond to: - code corresponds
* to the classid of eu.dnetlib.dhp.schema.oaf.Country - label corresponds to the classname of
* eu.dnetlib.dhp.schema.oaf.Country - provenance set only if the dataInfo associated to the Country of the result to be
* dumped is not null. In this case : - provenance corresponds to dataInfo.provenanceaction.classid (to be modified with
* datainfo.provenanceaction.classname) - trust corresponds to dataInfo.trust
public class Country extends Qualifier {
private Provenance provenance;
public Provenance getProvenance() {
return provenance;
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
public static Country newInstance(String code, String label, Provenance provenance) {
Country c = new Country();
return c;
public static Country newInstance(String code, String label, String provenance, String trust) {
return newInstance(code, label, Provenance.newInstance(provenance, trust));

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
/**
 * Serializable bean describing a funder through its short name, its name and its jurisdiction.
 */
public class Funder implements Serializable {
    private String shortName;
    private String name;
    private String jurisdiction;

    public String getJurisdiction() {
        return jurisdiction;
    }

    public void setJurisdiction(String jurisdiction) {
        this.jurisdiction = jurisdiction;
    }

    public String getShortName() {
        return shortName;
    }

    public void setShortName(String shortName) {
        this.shortName = shortName;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
* Represents the geolocation information. It has three parameters: - point of type String to store the point
* information. It corresponds to eu.dnetlib.dhp.schema.oaf.GeoLocation point - box ot type String to store the box
* information. It corresponds to eu.dnetlib.dhp.schema.oaf.GeoLocation box - place of type String to store the place
* information. It corresponds to eu.dnetlib.dhp.schema.oaf.GeoLocation place
public class GeoLocation implements Serializable {
private String point;
private String box;
private String place;
public String getPoint() {
return point;
public void setPoint(String point) {
this.point = point;
public String getBox() {
return box;
public void setBox(String box) {
this.box = box;
public String getPlace() {
return place;
public void setPlace(String place) {
this.place = place;
public boolean isBlank() {
return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place);

View File

@ -0,0 +1,107 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import java.util.List;
* Represents the manifestations (i.e. different versions) of the result. For example: the pre-print and the published
* versions are two manifestations of the same research result. It has the following parameters: - license of type
* String to store the license applied to the instance. It corresponds to the value of the licence in the instance to be
* dumped - accessright of type eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store the accessright of the instance. -
* type of type String to store the type of the instance as defined in the corresponding dnet vocabulary
* (dnet:pubication_resource). It corresponds to the instancetype.classname of the instance to be mapped - hostedby of
* type eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance can be
* viewed or downloaded. It is mapped against the hostedby parameter of the instance to be dumped and - key corresponds
* to hostedby.key - value corresponds to hostedby.value - url of type List<String> list of locations where the instance
* is accessible. It corresponds to url of the instance to be dumped - collectedfrom of type
* eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance has been
* collected. It is mapped against the collectedfrom parameter of the instance to be dumped and - key corresponds to
* collectedfrom.key - value corresponds to collectedfrom.value - publicationdate of type String to store the
* publication date of the instance ;// dateofacceptance; - refereed of type String to store information abour tthe
* review status of the instance. Possible values are 'Unknown', 'nonPeerReviewed', 'peerReviewed'. It corresponds to
* refereed.classname of the instance to be dumped
public class Instance implements Serializable {
private String license;
private AccessRight accessright;
private String type;
private KeyValue hostedby;
private List<String> url;
private KeyValue collectedfrom;
private String publicationdate;// dateofacceptance;
private String refereed; // peer-review status
public String getLicense() {
return license;
public void setLicense(String license) {
this.license = license;
public AccessRight getAccessright() {
return accessright;
public void setAccessright(AccessRight accessright) {
this.accessright = accessright;
public String getType() {
return type;
public void setType(String type) {
this.type = type;
public KeyValue getHostedby() {
return hostedby;
public void setHostedby(KeyValue hostedby) {
this.hostedby = hostedby;
public List<String> getUrl() {
return url;
public void setUrl(List<String> url) {
this.url = url;
public KeyValue getCollectedfrom() {
return collectedfrom;
public void setCollectedfrom(KeyValue collectedfrom) {
this.collectedfrom = collectedfrom;
public String getPublicationdate() {
return publicationdate;
public void setPublicationdate(String publicationdate) {
this.publicationdate = publicationdate;
public String getRefereed() {
return refereed;
public void setRefereed(String refereed) {
this.refereed = refereed;

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
* To represent the information described by a key and a value. It has two parameters: - key to store the key (generally
* the OpenAIRE id for some entity) - value to store the value (generally the OpenAIRE name for the key)
public class KeyValue implements Serializable {
private String key;
private String value;
public String getKey() {
return key;
public void setKey(String key) {
this.key = key;
public String getValue() {
return value;
public void setValue(String value) {
this.value = value;
public static KeyValue newInstance(String key, String value) {
KeyValue inst = new KeyValue();
inst.key = key;
inst.value = value;
return inst;
public boolean isBlank() {
return StringUtils.isBlank(key) && StringUtils.isBlank(value);

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
* To represent the generic persistent identifier. It has two parameters: - id of type
* eu.dnetlib.dhp.schema.dump.oaf.ControlledField to store the scheme and value of the Persistent Identifier. -
* provenance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance to store the provenance and trust of the information
public class Pid implements Serializable {
private ControlledField id;
private Provenance provenance;
public ControlledField getId() {
return id;
public void setId(ControlledField pid) {
this.id = pid;
public Provenance getProvenance() {
return provenance;
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
public static Pid newInstance(ControlledField pid, Provenance provenance) {
Pid p = new Pid();
p.id = pid;
p.provenance = provenance;
return p;
public static Pid newInstance(ControlledField pid) {
Pid p = new Pid();
p.id = pid;
return p;

View File

@ -0,0 +1,44 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
/**
 * Serializable bean describing a project through its id, code, acronym and title. The fields are protected so that
 * subclasses can access them directly.
 */
public class Project implements Serializable {
    protected String id; // OpenAIRE id
    protected String code;
    protected String acronym;
    protected String title;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getAcronym() {
        return acronym;
    }

    public void setAcronym(String acronym) {
        this.acronym = acronym;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }
}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
/**
 * Indicates the process that produced (or provided) the information, and the trust associated to the information.
 * It has two parameters:
 * <ul>
 * <li>provenance of type String to store the provenance of the information</li>
 * <li>trust of type String to store the trust associated to the information</li>
 * </ul>
 */
public class Provenance implements Serializable {
    private String provenance;
    private String trust;

    public String getProvenance() {
        return provenance;
    }

    public void setProvenance(String provenance) {
        this.provenance = provenance;
    }

    public String getTrust() {
        return trust;
    }

    public void setTrust(String trust) {
        this.trust = trust;
    }

    /**
     * Creates a Provenance populated with the given provenance and trust.
     *
     * @param provenance the provenance of the information
     * @param trust the trust associated to the information
     * @return a new instance holding both values
     */
    public static Provenance newInstance(String provenance, String trust) {
        Provenance p = new Provenance();
        p.provenance = provenance;
        p.trust = trust;
        return p;
    }

    /**
     * Returns the provenance immediately followed by the trust, with no separator.
     */
    @Override
    public String toString() {
        return provenance + trust;
    }
}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
/**
 * To represent the information described by a code and a label. It has two parameters:
 * <ul>
 * <li>code to store the code (generally the classid of the eu.dnetlib.dhp.schema.oaf.Qualifier element)</li>
 * <li>label to store the label (generally the classname of the eu.dnetlib.dhp.schema.oaf.Qualifier element)</li>
 * </ul>
 */
public class Qualifier implements Serializable {
    private String code; // the classid in the Qualifier
    private String label; // the classname in the Qualifier

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getLabel() {
        return label;
    }

    public void setLabel(String label) {
        this.label = label;
    }

    /**
     * Creates a Qualifier populated with the given code and label.
     *
     * @param code the code to store
     * @param value the text stored as the label
     * @return a new instance whose code is {@code code} and whose label is {@code value}
     */
    public static Qualifier newInstance(String code, String value) {
        Qualifier qualifier = new Qualifier();
        // The factory must copy its arguments into the new object; returning a bare instance loses them.
        qualifier.setCode(code);
        qualifier.setLabel(value);
        return qualifier;
    }
}

View File

@ -0,0 +1,391 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
* To represent the dumped result. It will be extended in the dump for Research Communities - Research
* Initiative/Infrastructures. It has the following parameters: - author of type
* List<eu.dnetlib.dhpschema.dump.oaf.Author> to describe the authors of a result. For each author in the result
* represented in the internal model one author in the esternal model is produced. - type of type String to represent
* the category of the result. Possible values are publication, dataset, software, other. It corresponds to
* resulttype.classname of the dumped result - language of type eu.dnetlib.dhp.schema.dump.oaf.Qualifier to store
* information about the language of the result. It is dumped as - code corresponds to language.classid - value
* corresponds to language.classname - country of type List<eu.dnetlib.dhp.schema.dump.oaf.Country> to store the country
* list to which the result is associated. For each country in the result respresented in the internal model one country
* in the external model is produces - subjects of type List<eu.dnetlib.dhp.dump.oaf.Subject> to store the subjects for
* the result. For each subject in the result represented in the internal model one subject in the external model is
* produced - maintitle of type String to store the main title of the result. It corresponds to the value of the first
* title in the resul to be dumped having classid equals to "main title" - subtitle of type String to store the subtitle
* of the result. It corresponds to the value of the first title in the resul to be dumped having classid equals to
* "subtitle" - description of type List<String> to store the description of the result. It corresponds to the list of
* description.value in the result represented in the internal model - publicationdate of type String to store the
* pubblication date. It corresponds to dateofacceptance.value in the result represented in the internal model -
* publisher of type String to store information about the publisher. It corresponds to publisher.value of the result
* represented in the intrenal model - embargoenddate of type String to store the embargo end date. It corresponds to
* embargoenddate.value of the result represented in the internal model - source of type List<String> See definition of
* Dublin Core field dc:source. It corresponds to the list of source.value in the result represented in the internal
* model - format of type List<String> It corresponds to the list of format.value in the result represented in the
* internal model - contributor of type List<String> to represent contributors for this result. It corresponds to the
* list of contributor.value in the result represented in the internal model - coverage of type String. It corresponds
* to the list of coverage.value in the result represented in the internal model - bestaccessright of type
* eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store informatin about the openest access right associated to the
* manifestations of this research results. It corresponds to the same parameter in the result represented in the
* internal model - instance of type List<eu.dnetlib.dhp.schema.dump.oaf.Instance> to store all the instances associated
* to the result. It corresponds to the same parameter in the result represented in the internal model - container of
* type eu.dnetlib.dhp.schema/dump.oaf.Container (only for result of type publication). It corresponds to the parameter
* journal of the result represented in the internal model - documentationUrl of type List<String> (only for results of
* type software) to store the URLs to the software documentation. It corresponds to the list of documentationUrl.value
* of the result represented in the internal model - codeRepositoryUrl of type String (only for results of type
* software) to store the URL to the repository with the source code. It corresponds to codeRepositoryUrl.value of the
* result represented in the internal model - programmingLanguage of type String (only for results of type software) to
* store the programming language. It corresponds to programmingLanguaga.classid of the result represented in the
* internal model - contactperson of type List<String> (only for results of type other) to store the contact person for
* this result. It corresponds to the list of contactperson.value of the result represented in the internal model -
* contactgroup of type List<String> (only for results of type other) to store the information for the contact group. It
* corresponds to the list of contactgroup.value of the result represented in the internal model - tool of type
* List<String> (only fro results of type other) to store information about tool useful for the interpretation and/or
* re-used of the research product. It corresponds to the list of tool.value in the result represented in the internal
* modelt - size of type String (only for results of type dataset) to store the size of the dataset. It corresponds to
* size.value in the result represented in the internal model - version of type String (only for results of type
* dataset) to store the version. It corresponds to version.value of the result represented in the internal model -
* geolocation fo type List<eu.dnetlib.dhp.schema.dump.oaf.GeoLocation> (only for results of type dataset) to store
* geolocation information. For each geolocation element in the result represented in the internal model a GeoLocation
* in the external model il produced - id of type String to store the OpenAIRE id of the result. It corresponds to the
* id of the result represented in the internal model - originalId of type List<String> to store the original ids of the
* result. It corresponds to the originalId of the result represented in the internal model - pid of type
* List<eu.dnetlib.dhp.schema.dump.oaf.ControlledField> to store the persistent identifiers for the result. For each pid
* in the results represented in the internal model one pid in the external model is produced. The value correspondence
* is: - scheme corresponds to pid.qualifier.classid of the result represented in the internal model - value corresponds
* to the pid.value of the result represented in the internal model - dateofcollection of type String to store
* information about the time OpenAIRE collected the record. It corresponds to dateofcollection of the result
* represented in the internal model - lasteupdatetimestamp of type String to store the timestamp of the last update of
* the record. It corresponds to lastupdatetimestamp of the resord represented in the internal model
public class Result implements Serializable {
private List<Author> author;
// resulttype allows subclassing results into publications | datasets | software
private String type; // resulttype
// common fields
private Qualifier language;
private List<Country> country;
private List<Subject> subjects;
private String maintitle;
private String subtitle;
private List<String> description;
private String publicationdate; // dateofacceptance;
private String publisher;
private String embargoenddate;
private List<String> source;
private List<String> format;
private List<String> contributor;
private List<String> coverage;
private AccessRight bestaccessright;
private List<Instance> instance;
private Container container;// Journal
private List<String> documentationUrl; // software
private String codeRepositoryUrl; // software
private String programmingLanguage; // software
private List<String> contactperson; // orp
private List<String> contactgroup; // orp
private List<String> tool; // orp
private String size; // dataset
private String version; // dataset
private List<GeoLocation> geolocation; // dataset
private String id;
private List<String> originalId;
private List<ControlledField> pid;
private String dateofcollection;
private Long lastupdatetimestamp;
public Long getLastupdatetimestamp() {
return lastupdatetimestamp;
public void setLastupdatetimestamp(Long lastupdatetimestamp) {
this.lastupdatetimestamp = lastupdatetimestamp;
public String getId() {
return id;
public void setId(String id) {
this.id = id;
public List<String> getOriginalId() {
return originalId;
public void setOriginalId(List<String> originalId) {
this.originalId = originalId;
public List<ControlledField> getPid() {
return pid;
public void setPid(List<ControlledField> pid) {
this.pid = pid;
public String getDateofcollection() {
return dateofcollection;
public void setDateofcollection(String dateofcollection) {
this.dateofcollection = dateofcollection;
public List<Author> getAuthor() {
return author;
public String getType() {
return type;
public void setType(String type) {
this.type = type;
public Container getContainer() {
return container;
public void setContainer(Container container) {
this.container = container;
public void setAuthor(List<Author> author) {
this.author = author;
public Qualifier getLanguage() {
return language;
public void setLanguage(Qualifier language) {
this.language = language;
public List<Country> getCountry() {
return country;
public void setCountry(List<Country> country) {
this.country = country;
public List<Subject> getSubjects() {
return subjects;
public void setSubjects(List<Subject> subjects) {
this.subjects = subjects;
public String getMaintitle() {
return maintitle;
public void setMaintitle(String maintitle) {
this.maintitle = maintitle;
public String getSubtitle() {
return subtitle;
public void setSubtitle(String subtitle) {
this.subtitle = subtitle;
public List<String> getDescription() {
return description;
public void setDescription(List<String> description) {
this.description = description;
public String getPublicationdate() {
return publicationdate;
public void setPublicationdate(String publicationdate) {
this.publicationdate = publicationdate;
public String getPublisher() {
return publisher;
public void setPublisher(String publisher) {
this.publisher = publisher;
public String getEmbargoenddate() {
return embargoenddate;
public void setEmbargoenddate(String embargoenddate) {
this.embargoenddate = embargoenddate;
public List<String> getSource() {
return source;
public void setSource(List<String> source) {
this.source = source;
public List<String> getFormat() {
return format;
public void setFormat(List<String> format) {
this.format = format;
public List<String> getContributor() {
return contributor;
public void setContributor(List<String> contributor) {
this.contributor = contributor;
public List<String> getCoverage() {
return coverage;
public void setCoverage(List<String> coverage) {
this.coverage = coverage;
public AccessRight getBestaccessright() {
return bestaccessright;
public void setBestaccessright(AccessRight bestaccessright) {
this.bestaccessright = bestaccessright;
public List<Instance> getInstance() {
return instance;
public void setInstance(List<Instance> instance) {
this.instance = instance;
public List<String> getDocumentationUrl() {
return documentationUrl;
public void setDocumentationUrl(List<String> documentationUrl) {
this.documentationUrl = documentationUrl;
public String getCodeRepositoryUrl() {
return codeRepositoryUrl;
public void setCodeRepositoryUrl(String codeRepositoryUrl) {
this.codeRepositoryUrl = codeRepositoryUrl;
public String getProgrammingLanguage() {
return programmingLanguage;
public void setProgrammingLanguage(String programmingLanguage) {
this.programmingLanguage = programmingLanguage;
public List<String> getContactperson() {
return contactperson;
public void setContactperson(List<String> contactperson) {
this.contactperson = contactperson;
public List<String> getContactgroup() {
return contactgroup;
public void setContactgroup(List<String> contactgroup) {
this.contactgroup = contactgroup;
public List<String> getTool() {
return tool;
public void setTool(List<String> tool) {
this.tool = tool;
public String getSize() {
return size;
public void setSize(String size) {
this.size = size;
public String getVersion() {
return version;
public void setVersion(String version) {
this.version = version;
public List<GeoLocation> getGeolocation() {
return geolocation;
public void setGeolocation(List<GeoLocation> geolocation) {
this.geolocation = geolocation;

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
* To represent keywords associated to the result. It has two parameters: - subject of type
* eu.dnetlib.dhp.schema.dump.oaf.ControlledField to describe the subject. It mapped as: - schema it corresponds to
* qualifier.classid of the dumped subject - value it corresponds to the subject value - provenance of type
* eu.dnetlib.dhp.schema.dump.oaf.Provenance to represent the provenance of the subject. It is dumped only if dataInfo
* is not null. In this case: - provenance corresponds to dataInfo.provenanceaction.classname - trust corresponds to
* dataInfo.trust
public class Subject implements Serializable {
private ControlledField subject;
private Provenance provenance;
public ControlledField getSubject() {
return subject;
public void setSubject(ControlledField subject) {
this.subject = subject;
public Provenance getProvenance() {
return provenance;
public void setProvenance(Provenance provenance) {
this.provenance = provenance;

View File

@ -0,0 +1,51 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
* extends eu.dnetlib.dhp.schema.dump.oaf.Result with the following parameters: - projects of type
* List<eu.dnetlib.dhp.schema.dump.oaf.community.Project> to store the list of projects related to the result. The
* information is added after the result is mapped to the external model - context of type
* List<eu.dnetlib.dhp.schema/dump.oaf.community.Context> to store information about the RC RI related to the result.
* For each context in the result represented in the internal model one context in the external model is produced -
* collectedfrom of type List<eu.dnetliv.dhp.schema.dump.oaf.KeyValue> to store information about the sources from which
* the record has been collected. For each collectedfrom in the result represented in the internal model one
* collectedfrom in the external model is produced
public class CommunityResult extends Result {
private List<Project> projects;
private List<Context> context;
protected List<KeyValue> collectedfrom;
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
public List<Project> getProjects() {
return projects;
public void setProjects(List<Project> projects) {
this.projects = projects;
public List<Context> getContext() {
return context;
public void setContext(List<Context> context) {
this.context = context;

View File

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import java.util.List;
import java.util.Objects;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;
* Reference to a relevant research infrastructure, initiative or community (RI/RC) among those collaborating with
* OpenAIRE. It extend eu.dnetlib.dhp.shema.dump.oaf.Qualifier with a parameter provenance of type
* List<eu.dnetlib.dhp.schema.dump.oaf.Provenance> to store the provenances of the association between the result and
* the RC/RI. The values for this element correspond to: - code: it corresponds to the id of the context in the result
* to be mapped. If the context id refers to a RC/RI and contains '::' only the part of the id before the first "::"
* will be used as value for code - label it corresponds to the label associated to the id. The information id taken
* from the profile of the RC/RI - provenance it is set only if the dataInfo associated to the contenxt element of the
* result to be dumped is not null. For each dataInfo one instance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance is
* instantiated if the element datainfo.provenanceaction is not null. In this case - provenance corresponds to
* dataInfo.provenanceaction.classname - trust corresponds to dataInfo.trust
public class Context extends Qualifier {
private List<Provenance> provenance;
public List<Provenance> getProvenance() {
return provenance;
public void setProvenance(List<Provenance> provenance) {
this.provenance = provenance;
public int hashCode() {
String provenance = new String();
this.provenance.forEach(p -> provenance.concat(p.toString()));
return Objects.hash(getCode(), getLabel(), provenance);

View File

@ -0,0 +1,52 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import java.io.Serializable;
/**
 * To store information about the funder funding the project related to the result. It has the following
 * parameters:
 * <ul>
 * <li>shortName of type String to store the funder short name (e.g. AKA)</li>
 * <li>name of type String to store the funder name (e.g. Academy of Finland)</li>
 * <li>fundingStream of type String to store the funding stream</li>
 * <li>jurisdiction of type String to store the jurisdiction of the funder</li>
 * </ul>
 */
public class Funder implements Serializable {
    private String shortName;
    private String name;
    private String fundingStream;
    private String jurisdiction;

    public String getJurisdiction() {
        return jurisdiction;
    }

    public void setJurisdiction(String jurisdiction) {
        this.jurisdiction = jurisdiction;
    }

    public String getShortName() {
        return shortName;
    }

    public void setShortName(String shortName) {
        this.shortName = shortName;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getFundingStream() {
        return fundingStream;
    }

    public void setFundingStream(String fundingStream) {
        this.fundingStream = fundingStream;
    }
}

View File

@ -0,0 +1,88 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
* To store information about the project related to the result. This information is not directly mapped from the result
* represented in the internal model because it is not there. The mapped result will be enriched with project
* information derived by relation between results and projects. Project class has the following parameters: - id of
* type String to store the OpenAIRE id for the Project - code of type String to store the grant agreement - acronym of
* type String to store the acronym for the project - title of type String to store the title of the project - funder of
* type eu.dnetlib.dhp.schema.dump.oaf.community.Funder to store information about the funder funding the project -
* provenance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance to store information about the. provenance of the
* association between the result and the project
public class Project implements Serializable {
private String id;// OpenAIRE id
private String code;
private String acronym;
private String title;
private Funder funder;
private Provenance provenance;
public Provenance getProvenance() {
return provenance;
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
public String getId() {
return id;
public void setId(String id) {
this.id = id;
public String getCode() {
return code;
public void setCode(String code) {
this.code = code;
public String getAcronym() {
return acronym;
public void setAcronym(String acronym) {
this.acronym = acronym;
public String getTitle() {
return title;
public void setTitle(String title) {
this.title = title;
public Funder getFunder() {
return funder;
public void setFunder(Funder funders) {
this.funder = funders;
public static Project newInstance(String id, String code, String acronym, String title, Funder funder) {
Project project = new Project();
return project;

View File

@ -0,0 +1,21 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
/**
 * String constants used by the graph dump: relation names, entity type names and context identifiers.
 */
public class Constants implements Serializable {
    // collectedFrom pairs with isProvidedBy -> taken from ModelSupport
    public static final String HOSTED_BY = "isHostedBy";
    public static final String HOSTS = "hosts";

    // community result: uses isRelatedTo

    public static final String RESULT_ENTITY = "result";
    public static final String DATASOURCE_ENTITY = "datasource";
    public static final String CONTEXT_ENTITY = "context";

    public static final String CONTEXT_ID = "60";
    public static final String CONTEXT_NS_PREFIX = "context____";
}

View File

@ -0,0 +1,316 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.Container;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
* To store information about the datasource OpenAIRE collects information from. It contains the following parameters: -
* id of type String to store the OpenAIRE id for the datasource. It corresponds to the parameter id of the datasource
* represented in the internal model - originalId of type List<String> to store the list of original ids associated to
* the datasource. It corresponds to the parameter originalId of the datasource represented in the internal model. The
* null values are filtered out - pid of type List<eu.dnetlib.shp.schema.dump.oaf.ControlledField> to store the
* persistent identifiers for the datasource. For each pid in the datasource represented in the internal model one pid
* in the external model is produced as : - schema corresponds to pid.qualifier.classid of the datasource represented in
* the internal model - value corresponds to pid.value of the datasource represented in the internal model -
* datasourceType of type eu.dnetlib.dhp.schema.dump.oaf.ControlledField to store the datasource type (e.g.
* pubsrepository::institutional, Institutional Repository) as in the dnet vocabulary dnet:datasource_typologies. It
* corresponds to datasourcetype of the datasource represented in the internal model and : - code corresponds to
* datasourcetype.classid - value corresponds to datasourcetype.classname - openairecompatibility of type String to
* store information about the OpenAIRE compatibility of the ingested results (which guidelines they are compliant to).
* It corresponds to openairecompatibility.classname of the datasource represented in the internal model - officialname
* of type Sgtring to store the official name of the datasource. It correspond to officialname.value of the datasource
* represented in the internal model - englishname of type String to store the English name of the datasource. It
* corresponds to englishname.value of the datasource represented in the internal model - websiteurl of type String to
* store the URL of the website of the datasource. It corresponds to websiteurl.value of the datasource represented in
* the internal model - logourl of type String to store the URL of the logo for the datasource. It corresponds to
* logourl.value of the datasource represented in the internal model - dateofvalidation of type String to store the data
* of validation against the guidelines for the datasource records. It corresponds to dateofvalidation.value of the
* datasource represented in the internal model - description of type String to store the description for the
* datasource. It corresponds to description.value of the datasource represented in the internal model
public class Datasource implements Serializable {
private String id; // string
private List<String> originalId; // list string
private List<ControlledField> pid; // list<String>
private ControlledField datasourcetype; // value
private String openairecompatibility; // value
private String officialname; // string
private String englishname; // string
private String websiteurl; // string
private String logourl; // string
private String dateofvalidation; // string
private String description; // description
private List<String> subjects; // List<String>
// opendoar specific fields (od*)
private List<String> languages; // odlanguages List<String>
private List<String> contenttypes; // odcontent types List<String>
// re3data fields
private String releasestartdate; // string
private String releaseenddate; // string
private String missionstatementurl; // string
// {open, restricted or closed}
private String accessrights; // databaseaccesstype string
// {open, restricted or closed}
private String uploadrights; // datauploadtype string
// {feeRequired, registration, other}
private String databaseaccessrestriction; // string
// {feeRequired, registration, other}
private String datauploadrestriction; // string
private Boolean versioning; // boolean
private String citationguidelineurl; // string
// {yes, no, uknown}
private String pidsystems; // string
private String certificates; // string
private List<Object> policies; //
private Container journal; // issn etc del Journal
public String getId() {
return id;
public void setId(String id) {
this.id = id;
public List<String> getOriginalId() {
return originalId;
public void setOriginalId(List<String> originalId) {
this.originalId = originalId;
public List<ControlledField> getPid() {
return pid;
public void setPid(List<ControlledField> pid) {
this.pid = pid;
public ControlledField getDatasourcetype() {
return datasourcetype;
public void setDatasourcetype(ControlledField datasourcetype) {
this.datasourcetype = datasourcetype;
public String getOpenairecompatibility() {
return openairecompatibility;
public void setOpenairecompatibility(String openairecompatibility) {
this.openairecompatibility = openairecompatibility;
public String getOfficialname() {
return officialname;
public void setOfficialname(String officialname) {
this.officialname = officialname;
public String getEnglishname() {
return englishname;
public void setEnglishname(String englishname) {
this.englishname = englishname;
public String getWebsiteurl() {
return websiteurl;
public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
public String getLogourl() {
return logourl;
public void setLogourl(String logourl) {
this.logourl = logourl;
public String getDateofvalidation() {
return dateofvalidation;
public void setDateofvalidation(String dateofvalidation) {
this.dateofvalidation = dateofvalidation;
public String getDescription() {
return description;
public void setDescription(String description) {
this.description = description;
public List<String> getSubjects() {
return subjects;
public void setSubjects(List<String> subjects) {
this.subjects = subjects;
public List<String> getLanguages() {
return languages;
public void setLanguages(List<String> languages) {
this.languages = languages;
public List<String> getContenttypes() {
return contenttypes;
public void setContenttypes(List<String> contenttypes) {
this.contenttypes = contenttypes;
public String getReleasestartdate() {
return releasestartdate;
public void setReleasestartdate(String releasestartdate) {
this.releasestartdate = releasestartdate;
public String getReleaseenddate() {
return releaseenddate;
public void setReleaseenddate(String releaseenddate) {
this.releaseenddate = releaseenddate;
public String getMissionstatementurl() {
return missionstatementurl;
public void setMissionstatementurl(String missionstatementurl) {
this.missionstatementurl = missionstatementurl;
public String getAccessrights() {
return accessrights;
public void setAccessrights(String accessrights) {
this.accessrights = accessrights;
public String getUploadrights() {
return uploadrights;
public void setUploadrights(String uploadrights) {
this.uploadrights = uploadrights;
public String getDatabaseaccessrestriction() {
return databaseaccessrestriction;
public void setDatabaseaccessrestriction(String databaseaccessrestriction) {
this.databaseaccessrestriction = databaseaccessrestriction;
public String getDatauploadrestriction() {
return datauploadrestriction;
public void setDatauploadrestriction(String datauploadrestriction) {
this.datauploadrestriction = datauploadrestriction;
public Boolean getVersioning() {
return versioning;
public void setVersioning(Boolean versioning) {
this.versioning = versioning;
public String getCitationguidelineurl() {
return citationguidelineurl;
public void setCitationguidelineurl(String citationguidelineurl) {
this.citationguidelineurl = citationguidelineurl;
public String getPidsystems() {
return pidsystems;
public void setPidsystems(String pidsystems) {
this.pidsystems = pidsystems;
public String getCertificates() {
return certificates;
public void setCertificates(String certificates) {
this.certificates = certificates;
public List<Object> getPolicies() {
return policies;
public void setPolicies(List<Object> policiesr3) {
this.policies = policiesr3;
public Container getJournal() {
return journal;
public void setJournal(Container journal) {
this.journal = journal;

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
* To store information about the funder funding the project related to the result. It has the following parameters:
* - private String shortName to store the short name of the funder (e.g. AKA)
* - private String name to store information about the name of the funder (e.g. Akademy of Finland)
* - private Fundings funding_stream to store the fundingstream
* - private String jurisdiction to store information about the jurisdiction of the funder
public class Funder implements Serializable {
private String shortName;
private String name;
private Fundings funding_stream;
private String jurisdiction;
public String getShortName() {
return shortName;
public void setShortName(String shortName) {
this.shortName = shortName;
public String getName() {
return name;
public void setName(String name) {
this.name = name;
public String getJurisdiction() {
return jurisdiction;
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
public Fundings getFunding_stream() {
return funding_stream;
public void setFunding_stream(Fundings funding_stream) {
this.funding_stream = funding_stream;

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
/**
 * To store information about the funding stream. It has two parameters:
 * <ul>
 * <li>private String id to store the id of the funding stream. The id is created by appending the shortname of the
 * funder to the name of each level in the xml representing the funding stream. For example: if the funder is the
 * European Commission, the funding level 0 name is FP7, the funding level 1 name is SP3 and the funding level 2 name
 * is PEOPLE then the id will be: EC::FP7::SP3::PEOPLE</li>
 * <li>private String description to describe the funding stream. It is created by concatenating the description of
 * each funding level so for the example above the description would be: SEVENTH FRAMEWORK PROGRAMME - SP3-People -
 * Marie-Curie Actions</li>
 * </ul>
 */
public class Fundings implements Serializable {

	private String id;

	private String description;

	/** @return the id of the funding stream */
	public String getId() {
		return this.id;
	}

	/** @param id the id of the funding stream */
	public void setId(final String id) {
		this.id = id;
	}

	/** @return the description of the funding stream */
	public String getDescription() {
		return this.description;
	}

	/** @param description the description of the funding stream */
	public void setDescription(final String description) {
		this.description = description;
	}
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.Optional;
/**
 * To describe the funded amount. It has the following parameters:
 * <ul>
 * <li>private String currency to store the currency of the fund</li>
 * <li>private float totalcost to store the total cost of the project</li>
 * <li>private float fundedamount to store the funded amount by the funder</li>
 * </ul>
 */
public class Granted implements Serializable {

	private String currency;

	private float totalcost;

	private float fundedamount;

	/**
	 * Creates an instance carrying currency, total cost and funded amount.
	 *
	 * @param currency the currency of the fund
	 * @param totalcost the total cost of the project
	 * @param fundedamount the amount funded by the funder
	 * @return the populated instance
	 */
	public static Granted newInstance(String currency, float totalcost, float fundedamount) {
		final Granted result = new Granted();
		result.setCurrency(currency);
		result.setTotalcost(totalcost);
		result.setFundedamount(fundedamount);
		return result;
	}

	/**
	 * Creates an instance carrying only currency and funded amount; the total cost keeps its default value.
	 *
	 * @param currency the currency of the fund
	 * @param fundedamount the amount funded by the funder
	 * @return the populated instance
	 */
	public static Granted newInstance(String currency, float fundedamount) {
		final Granted result = new Granted();
		result.setCurrency(currency);
		result.setFundedamount(fundedamount);
		return result;
	}

	public String getCurrency() {
		return this.currency;
	}

	public void setCurrency(final String currency) {
		this.currency = currency;
	}

	public float getTotalcost() {
		return this.totalcost;
	}

	public void setTotalcost(final float totalcost) {
		this.totalcost = totalcost;
	}

	public float getFundedamount() {
		return this.fundedamount;
	}

	public void setFundedamount(final float fundedamount) {
		this.fundedamount = fundedamount;
	}
}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
/**
 * To represent the generic node in a relation. It has the following parameters:
 * <ul>
 * <li>private String id the openaire id of the entity in the relation</li>
 * <li>private String type the type of the entity in the relation</li>
 * </ul>
 * Consider the generic relation between a Result R and a Project P: the node representing R will have as id the id of
 * R and as type result, while the node representing the project will have as id the id of the project and as type
 * project.
 */
public class Node implements Serializable {

	private String id;

	private String type;

	/**
	 * Creates a node for the entity with the given id and type.
	 *
	 * @param id the openaire id of the entity
	 * @param type the type of the entity
	 * @return the populated node
	 */
	public static Node newInstance(String id, String type) {
		final Node created = new Node();
		created.setId(id);
		created.setType(type);
		return created;
	}

	public String getId() {
		return this.id;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public String getType() {
		return this.type;
	}

	public void setType(final String type) {
		this.type = type;
	}
}

View File

@ -0,0 +1,89 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.Country;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
* To represent the generic organizaiton. It has the following parameters:
* - private String legalshortname to store the legalshortname of the organizaiton
* - private String legalname to store the legal name of the organization
* - private String websiteurl to store the websiteurl of the organization
* - private List<String> alternativenames to store the alternative names of the organization
* - private Qualifier country to store the country of the organization
* - private String id to store the id of the organization
* - private List<ControlledField> pid to store the list of pids for the organization
public class Organization implements Serializable {
private String legalshortname;
private String legalname;
private String websiteurl;
private List<String> alternativenames;
private Qualifier country;
private String id;
private List<ControlledField> pid;
public String getLegalshortname() {
return legalshortname;
public void setLegalshortname(String legalshortname) {
this.legalshortname = legalshortname;
public String getLegalname() {
return legalname;
public void setLegalname(String legalname) {
this.legalname = legalname;
public String getWebsiteurl() {
return websiteurl;
public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
public List<String> getAlternativenames() {
return alternativenames;
public void setAlternativenames(List<String> alternativenames) {
this.alternativenames = alternativenames;
public Qualifier getCountry() {
return country;
public void setCountry(Qualifier country) {
this.country = country;
public String getId() {
return id;
public void setId(String id) {
this.id = id;
public List<ControlledField> getPid() {
return pid;
public void setPid(List<ControlledField> pid) {
this.pid = pid;

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
/**
 * To store information about the EC programme for the project. It has the following parameters:
 * <ul>
 * <li>private String code to store the code of the programme</li>
 * <li>private String description to store the description of the programme</li>
 * </ul>
 */
public class Programme implements Serializable {

	private String code;

	private String description;

	/**
	 * Creates a programme with the given code and description.
	 *
	 * @param code the code of the programme
	 * @param description the description of the programme
	 * @return the populated programme
	 */
	public static Programme newInstance(String code, String description) {
		final Programme programme = new Programme();
		programme.setCode(code);
		programme.setDescription(description);
		return programme;
	}

	public String getCode() {
		return this.code;
	}

	public void setCode(final String code) {
		this.code = code;
	}

	public String getDescription() {
		return this.description;
	}

	public void setDescription(final String description) {
		this.description = description;
	}
}

View File

@ -0,0 +1,195 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
* This is the class representing the Project in the model used for the dumps of the whole graph. At the moment the dump
* of the Projects differs from the other dumps because we do not create relations between Funders (Organization) and
* Projects but we put the information about the Funder within the Project representation. We also removed the
* collected from element from the Project. No relation between the Project and the Datasource entity from which it is
* collected will be created. We will never create relations between Project and Datasource. In case some relation will
* be extracted from the Project they will refer the Funder and will be of type ( organization -> funds -> project,
* project -> isFundedBy -> organization) We also removed the duration parameter because the most of times it is set to
* 0. It has the following parameters:
* - private String id to store the id of the project (OpenAIRE id)
* - private String websiteurl to store the websiteurl of the project
* - private String code to store the grant agreement of the project
* - private String acronym to store the acronym of the project
* - private String title to store the tile of the project
* - private String startdate to store the start date
* - private String enddate to store the end date
* - private String callidentifier to store the call indentifier
* - private String keywords to store the keywords
* - private boolean openaccessmandateforpublications to store if the project must accomplish to the open access mandate
* for publications. This value will be set to true if one of the field in the project represented in the internal model
* is set to true
* - private boolean openaccessmandatefordataset to store if the project must accomplish to the open access mandate for
* dataset. It is set to the value in the corresponding filed of the project represented in the internal model
* - private List<String> subject to store the list of subjects of the project
* - private List<Funder> funding to store the list of funder of the project
* - private String summary to store the summary of the project
* - private Granted granted to store the granted amount
* - private List<Programme> programme to store the list of programmes the project is related to
public class Project implements Serializable {
private String id;
private String websiteurl;
private String code;
private String acronym;
private String title;
private String startdate;
private String enddate;
private String callidentifier;
private String keywords;
private boolean openaccessmandateforpublications;
private boolean openaccessmandatefordataset;
private List<String> subject;
private List<Funder> funding;
private String summary;
private Granted granted;
private List<Programme> programme;
public String getId() {
return id;
public void setId(String id) {
this.id = id;
public String getWebsiteurl() {
return websiteurl;
public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
public String getCode() {
return code;
public void setCode(String code) {
this.code = code;
public String getAcronym() {
return acronym;
public void setAcronym(String acronym) {
this.acronym = acronym;
public String getTitle() {
return title;
public void setTitle(String title) {
this.title = title;
public String getStartdate() {
return startdate;
public void setStartdate(String startdate) {
this.startdate = startdate;
public String getEnddate() {
return enddate;
public void setEnddate(String enddate) {
this.enddate = enddate;
public String getCallidentifier() {
return callidentifier;
public void setCallidentifier(String callidentifier) {
this.callidentifier = callidentifier;
public String getKeywords() {
return keywords;
public void setKeywords(String keywords) {
this.keywords = keywords;
public boolean isOpenaccessmandateforpublications() {
return openaccessmandateforpublications;
public void setOpenaccessmandateforpublications(boolean openaccessmandateforpublications) {
this.openaccessmandateforpublications = openaccessmandateforpublications;
public boolean isOpenaccessmandatefordataset() {
return openaccessmandatefordataset;
public void setOpenaccessmandatefordataset(boolean openaccessmandatefordataset) {
this.openaccessmandatefordataset = openaccessmandatefordataset;
public List<String> getSubject() {
return subject;
public void setSubject(List<String> subject) {
this.subject = subject;
public List<Funder> getFunding() {
return funding;
public void setFunding(List<Funder> funding) {
this.funding = funding;
public String getSummary() {
return summary;
public void setSummary(String summary) {
this.summary = summary;
public Granted getGranted() {
return granted;
public void setGranted(Granted granted) {
this.granted = granted;
public List<Programme> getProgramme() {
return programme;
public void setProgramme(List<Programme> programme) {
this.programme = programme;

View File

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
/**
 * To represent the semantics of the generic relation between two entities. It has the following parameters:
 * <ul>
 * <li>private String name to store the semantics of the relation (i.e. isAuthorInstitutionOf). It corresponds to the
 * relclass parameter in the relation represented in the internal model</li>
 * <li>private String type to store the type of the relation (i.e. affiliation). It corresponds to the subreltype
 * parameter of the relation represented in the internal model</li>
 * </ul>
 */
public class RelType implements Serializable {

	private String name; // relclass

	private String type; // subreltype

	/**
	 * Creates the semantics descriptor for a relation.
	 *
	 * @param name the semantics of the relation (relclass)
	 * @param type the type of the relation (subreltype)
	 * @return the populated instance
	 */
	public static RelType newInstance(String name, String type) {
		final RelType semantics = new RelType();
		semantics.setName(name);
		semantics.setType(type);
		return semantics;
	}

	public String getName() {
		return this.name;
	}

	public void setName(final String name) {
		this.name = name;
	}

	public String getType() {
		return this.type;
	}

	public void setType(final String type) {
		this.type = type;
	}
}

View File

@ -0,0 +1,68 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.Objects;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
* To represent the gereric relation between two entities. It has the following parameters:
* - private Node source to represent the entity source of the relation
* - private Node target to represent the entity target of the relation
* - private RelType reltype to represent the semantics of the relation
* - private Provenance provenance to represent the provenance of the relation
public class Relation implements Serializable {
private Node source;
private Node target;
private RelType reltype;
private Provenance provenance;
public Node getSource() {
return source;
public void setSource(Node source) {
this.source = source;
public Node getTarget() {
return target;
public void setTarget(Node target) {
this.target = target;
public RelType getReltype() {
return reltype;
public void setReltype(RelType reltype) {
this.reltype = reltype;
public Provenance getProvenance() {
return provenance;
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
public int hashCode() {
return Objects.hash(source.getId(), target.getId(), reltype.getType() + ":" + reltype.getName());
public static Relation newInstance(Node source, Node target, RelType reltype, Provenance provenance) {
Relation relation = new Relation();
relation.source = source;
relation.target = target;
relation.reltype = reltype;
relation.provenance = provenance;
return relation;

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.util.List;
* To represent RC entities. It extends eu.dnetlib.dhp.dump.oaf.grap.ResearchInitiative by adding the parameter subject
* to store the list of subjects related to the community
public class ResearchCommunity extends ResearchInitiative {
private List<String> subject;
public List<String> getSubject() {
return subject;
public void setSubject(List<String> subject) {
this.subject = subject;

View File

@ -0,0 +1,71 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
/**
 * To represent entity of type RC/RI. It has the following parameters, which are mostly derived by the profile:
 * <ul>
 * <li>private String id to store the openaire id for the entity. It has as code 00 and will be created as
 * 00|context_____::md5(originalId)</li>
 * <li>private String originalId to store the id of the context as provided in the profile (i.e. mes)</li>
 * <li>private String name to store the name of the context (got from the label attribute in the context
 * definition)</li>
 * <li>private String type to store the type of the context (i.e.: research initiative or research community)</li>
 * <li>private String description to store the description of the context as given in the profile</li>
 * <li>private String zenodo_community to store the zenodo community associated to the context (main zenodo
 * community)</li>
 * </ul>
 * NOTE(review): the Constants class of this package declares CONTEXT_ID = "60" and an 11-character
 * CONTEXT_NS_PREFIX, which do not match the "00|context_____" pattern described above - confirm which one is current.
 */
public class ResearchInitiative implements Serializable {

	private String id; // openaireId

	private String originalId; // context id

	private String name; // context name

	private String type; // context type: research initiative or research community

	private String description;

	private String zenodo_community;

	public String getId() {
		return this.id;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public String getOriginalId() {
		return this.originalId;
	}

	public void setOriginalId(final String originalId) {
		this.originalId = originalId;
	}

	public String getName() {
		return this.name;
	}

	public void setName(final String label) {
		this.name = label;
	}

	public String getType() {
		return this.type;
	}

	public void setType(final String type) {
		this.type = type;
	}

	public String getDescription() {
		return this.description;
	}

	public void setDescription(final String description) {
		this.description = description;
	}

	public String getZenodo_community() {
		return this.zenodo_community;
	}

	public void setZenodo_community(final String zenodo_community) {
		this.zenodo_community = zenodo_community;
	}
}

View File

@ -110,13 +110,6 @@ public class CommunityConfigurationFactory {
private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
// final Node oacommunitynode = node.selectSingleNode("./oacommunity");
// String oacommunity = null;
// if (oacommunitynode != null) {
// String tmp = oacommunitynode.getText();
// if (StringUtils.isNotBlank(tmp))
// oacommunity = tmp;
// }
final List<Node> list = node.selectNodes("./zenodocommunities/zenodocommunity");
final List<ZenodoCommunity> zenodoCommunityList = new ArrayList<>();
@ -127,11 +120,7 @@ public class CommunityConfigurationFactory {
// if (oacommunity != null) {
// ZenodoCommunity zc = new ZenodoCommunity();
// zc.setZenodoCommunityId(oacommunity);
// zenodoCommunityList.add(zc);
// }
log.info("size of the zenodo community list " + zenodoCommunityList.size());
return zenodoCommunityList;

View File

@ -20,8 +20,6 @@ import eu.dnetlib.dhp.schema.oaf.*;
/** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable {
private String trust = "0.8";
private boolean clearContext(Result result) {
int tmp = result.getContext().size();
List<Context> clist = result
@ -72,8 +70,9 @@ public class ResultTagger implements Serializable {
// tagging for Subject
final Set<String> subjects = new HashSet<>();
if (Objects.nonNull(result.getSubject())){
if (Objects.nonNull(result.getSubject())) {
.map(subject -> subject.getValue())
@ -91,13 +90,13 @@ public class ResultTagger implements Serializable {
if (Objects.nonNull(result.getInstance())) {
for (Instance i : result.getInstance()) {
if (Objects.nonNull(i.getCollectedfrom())) {
if (Objects.nonNull(i.getCollectedfrom().getKey())) {
tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
if (Objects.nonNull(i.getHostedby())) {
if (Objects.nonNull(i.getHostedby().getKey())) {
tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
@ -170,21 +169,24 @@ public class ResultTagger implements Serializable {
if (datasources.contains(c.getId()))
if (czenodo.contains(c.getId()))
return c;
@ -210,21 +212,24 @@ public class ResultTagger implements Serializable {
if (datasources.contains(c))
if (czenodo.contains(c))
return context;
@ -235,11 +240,12 @@ public class ResultTagger implements Serializable {
public static DataInfo getDataInfo(
String inference_provenance, String inference_class_id, String inference_class_name) {
String inference_provenance, String inference_class_id, String inference_class_name, String trust) {
DataInfo di = new DataInfo();
di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
return di;

View File

@ -14,4 +14,6 @@ public class TaggingConstants {
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
public static final String TAGGING_TRUST = "0.8";

View File

@ -106,12 +106,6 @@
<community id="aginfra">

View File

@ -42,6 +42,12 @@
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
@ -59,6 +65,12 @@
@ -92,14 +104,21 @@

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.util.Map;
import com.google.common.collect.Maps;
public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static final String INFERRED = "Inferred by OpenAIRE";
public static final String HARVESTED = "Harvested";
public static final String DEFAULT_TRUST = "0.9";
public static final String USER_CLAIM = "Linked by user";;
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
public static String ZENODO_COMMUNITY_PREFIX = "https://zenodo.org/communities/";
public static String RESEARCH_COMMUNITY = "Research Community";
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
public static String ORCID = "orcid";
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
accessRightsCoarMap.put("CLOSED", "c_14cb");
accessRightsCoarMap.put("EMBARGO", "c_f1cf");
static {
coarCodeLabelMap.put("c_abf2", "OPEN");
coarCodeLabelMap.put("c_16ec", "RESTRICTED");
coarCodeLabelMap.put("c_14cb", "CLOSED");
coarCodeLabelMap.put("c_f1cf", "EMBARGO");

View File

@ -0,0 +1,106 @@
package eu.dnetlib.dhp.oa.graph.dump;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
import eu.dnetlib.dhp.schema.oaf.*;
* It fires the execution of the actual dump for result entities. If the dump is for RC/RI products, it checks for each
* result that it belongs to at least one RC/RI before "asking" for its mapping.
public class DumpProducts implements Serializable {
// Entry point of the dump: creates the Spark configuration and, for the Spark session it is given, calls
// Utils.removeOutputDir on outputPath (presumably clearing it - verify against Utils) before delegating to execDump.
// inputClazz is the type of the entities read from inputPath; outputClazz is the dump type they are mapped to.
// graph is forwarded unchanged to execDump (see execMap for how it selects the community check).
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath,
Class<? extends OafEntity> inputClazz,
Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> outputClazz,
boolean graph) {
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath);
execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, graph);
// Obtains the community map from communityMapPath, reads the entities of type inputClazz found at inputPath and
// maps each of them through execMap, encoding the results as beans of type outputClazz.
// The output is configured with the "compression" option set to "gzip".
public static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> void execDump(
SparkSession spark,
String inputPath,
String outputPath,
String communityMapPath,
Class<I> inputClazz,
Class<O> outputClazz,
boolean graph) {
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
.readPath(spark, inputPath, inputClazz)
.map(value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
.option("compression", "gzip")
private static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O execMap(I value,
CommunityMap communityMap,
boolean graph) {
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
if (odInfo.isPresent()) {
if (odInfo.get().getDeletedbyinference()) {
return null;
} else {
return null;
if (!graph) {
Set<String> communities = communityMap.keySet();
Optional<List<Context>> inputContext = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Result) value).getContext());
if (!inputContext.isPresent()) {
return null;
List<String> toDumpFor = inputContext.get().stream().map(c -> {
if (communities.contains(c.getId())) {
return c.getId();
if (c.getId().contains("::") && communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
return c.getId().substring(0, 3);
return null;
if (toDumpFor.size() == 0) {
return null;
return (O) ResultMapper.map(value, communityMap, graph);

View File

@ -0,0 +1,114 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.*;
import org.apache.commons.compress.archivers.ar.ArArchiveEntry;
import org.apache.commons.compress.archivers.ar.ArArchiveOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
public class MakeTar implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
/**
 * Command line entry point: reads the "hdfsPath", "nameNode" and "sourcePath" arguments, opens the file system
 * of the given name node and archives the content of the source path.
 *
 * @param args command line arguments handled by ArgumentApplicationParser
 * @throws Exception declared by the method; the visible code does not handle any failure locally
 */
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
// "hdfsPath" is where the archives are written, "sourcePath" is what gets archived.
final String outputPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
// Point the Hadoop client at the requested name node before obtaining the file system.
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath);
/**
 * Creates one tar archive for every entry found directly under {@code inputPath}.
 * <p>
 * The archive of an entry named {@code X} is written to {@code outputPath/X.tar}, and {@code X} is also used as
 * the directory name of the entries inside the archive.
 *
 * @param fileSystem file system holding both the input and the output
 * @param inputPath directory whose direct children are archived
 * @param outputPath directory receiving the tar files
 * @throws IOException if listing or writing fails
 */
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dir_iterator.hasNext()) {
LocatedFileStatus fileStatus = dir_iterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
// Last path segment, i.e. the name of the child being archived.
String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
/**
 * Writes the files found under {@code inputPath} into a tar archive at {@code outputPath}.
 * <p>
 * Files whose path ends with "_SUCCESS" are skipped. Every other file becomes an entry named
 * {@code dir_name/<file name>.json.gz}; the suffix is appended unconditionally.
 *
 * @param fileSystem file system holding both the input and the output
 * @param inputPath directory listed recursively for the files to add
 * @param outputPath path of the tar file; an existing path is deleted first
 * @param dir_name directory name used as prefix of the entries inside the archive
 * @throws IOException if reading or writing fails
 */
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
fsDataOutputStream = fileSystem.create(hdfsWritePath);
// The tar stream wraps the stream underlying the FSDataOutputStream, not the FSDataOutputStream itself.
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
new Path(inputPath), true);
while (fileStatusListIterator.hasNext()) {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
// Skip Spark's _SUCCESS marker files.
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
InputStream is = fileSystem.open(fileStatus.getPath());
BufferedInputStream bis = new BufferedInputStream(is);
// Copy the file content into the archive in 1 KiB chunks.
int count;
byte data[] = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);

View File

@ -0,0 +1,58 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.StringReader;
import java.util.List;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * Queries the information system through an ISLookUpService and builds the CommunityMap (context id to context
 * label) from the returned profiles.
 */
public class QueryInformationSystem {
// Lookup service used to run the query; must be set via setIsLookUp before getCommunityMap is called.
private ISLookUpService isLookUp;
// XQuery selecting contexts of type 'community' or 'ri' whose 'status' param is 'manager' or 'all'.
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
" and ($x//context/param[./@name = 'status']/text() = 'manager' or $x//context/param[./@name = 'status']/text() = 'all') "
" return " +
"<community> " +
"{$x//CONFIGURATION/context/@id}" +
"{$x//CONFIGURATION/context/@label}" +
/**
 * Runs the XQuery against the lookup service and converts the answer into a CommunityMap.
 *
 * @return map from context id to context label
 * @throws ISLookUpException if the lookup service call fails
 * @throws DocumentException if one of the returned XML snippets cannot be parsed
 */
public CommunityMap getCommunityMap()
throws ISLookUpException, DocumentException {
return getMap(isLookUp.quickSearchProfile(XQUERY));
public ISLookUpService getIsLookUp() {
return isLookUp;
public void setIsLookUp(ISLookUpService isLookUpService) {
this.isLookUp = isLookUpService;
/**
 * Parses each XML snippet and stores the "id" attribute of its root element as key and the "label" attribute
 * as value.
 * NOTE(review): SAXReader is used with its default configuration; confirm the lookup service output is trusted
 * or disable DTD/external entity processing.
 */
private CommunityMap getMap(List<String> communityMap) throws DocumentException {
final CommunityMap map = new CommunityMap();
for (String xml : communityMap) {
final Document doc;
doc = new SAXReader().read(new StringReader(xml));
Element root = doc.getRootElement();
// A root element without "id" or "label" attribute makes attribute(...) return null and this line fail.
map.put(root.attribute("id").getValue(), root.attribute("label").getValue());
return map;

View File

@ -0,0 +1,523 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.dump.oaf.community.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ResultMapper implements Serializable {
/**
 * Maps an OAF result onto the dump model.
 * <p>
 * A plain {@code Result} is created when {@code graph} is true, a {@code CommunityResult} otherwise. The input
 * entity is cast to {@code eu.dnetlib.dhp.schema.oaf.Result}, so any other entity type fails with a
 * ClassCastException.
 *
 * @param in the OAF entity to map
 * @param communityMap community id to label map; its key set selects the contexts kept for community dumps
 * @param graph true for the whole-graph dump, false for the community dump
 * @return the mapped dump result
 */
public static <I extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
I in, Map<String, String> communityMap, boolean graph) {
Result out;
if (graph) {
out = new Result();
} else {
out = new CommunityResult();
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
// Type-specific fields, selected by the class id of the result type.
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
switch (ort.get().getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
Container c = new Container();
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
igl -> igl
.map(gli -> {
GeoLocation gl = new GeoLocation();
return gl;
case "software":
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
value -> out
.map(v -> v.getValue())
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
case "other":
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
// (only class ids present in Constants.accessRightsCoarMap are translated)
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
final List<String> contributorList = new ArrayList<>();
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
// List<Country> countryList = new ArrayList<>();
value -> out
c -> {
// Entries whose class id is ModelConstants.UNKNOWN are mapped to null.
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
Country country = new Country();
provenance -> country
return country;
// out.setCountry(countryList);
final List<String> coverageList = new ArrayList<>();
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
final List<String> descriptionList = new ArrayList<>();
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
final List<String> formatList = new ArrayList<>();
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
final List<Instance> instanceList = new ArrayList<>();
inst -> inst
.forEach(i -> instanceList.add(getInstance(i, graph))));
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
// Titles are split by qualifier: "main title" first, then "subtitle" (case-insensitive match).
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
if (iTitle.size() > 0) {
iTitle = otitle
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
if (iTitle.size() > 0) {
List<ControlledField> pids = new ArrayList<>();
value -> value
p -> pids
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
// oStr is reused for the following optional string fields.
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
List<String> sourceList = new ArrayList<>();
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
List<Subject> subjectList = new ArrayList<>();
value -> value
.forEach(s -> subjectList.add(getSubject(s))));
// Community-dump-only part: the code below casts out to CommunityResult.
if (!graph) {
((CommunityResult) out)
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
Set<String> communities = communityMap.keySet();
List<Context> contextList = Optional
value -> value
.map(c -> {
// Ids of the form <community>::<suffix> are reduced to the part before "::".
String community_id = c.getId();
if (community_id.indexOf("::") > 0) {
community_id = community_id.substring(0, community_id.indexOf("::"));
// Only contexts of communities listed in the community map produce a Context; others map to null.
if (communities.contains(community_id)) {
Context context = new Context();
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
if (dataInfo.isPresent()) {
List<Provenance> provenance = new ArrayList<>();
di -> Optional
provenanceaction -> Provenance
provenanceaction.getClassname(), di.getTrust()))
return context;
return null;
.orElse(new ArrayList<>());
if (contextList.size() > 0) {
// Contexts are de-duplicated through their hashCode before being set on the result
// (presumably only unseen ones are added to remainigContext; the adding lines are not visible here).
Set<Integer> hashValue = new HashSet<>();
List<Context> remainigContext = new ArrayList<>();
contextList.forEach(c -> {
if (!hashValue.contains(c.hashCode())) {
((CommunityResult) out).setContext(remainigContext);
return out;
/**
 * Maps an OAF instance onto the dump Instance.
 *
 * @param i the OAF instance; getCollectedfrom() and getHostedby() are dereferenced without null checks
 * @param graph not used in the lines visible here
 * @return the mapped instance
 */
private static Instance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i, boolean graph) {
Instance instance = new Instance();
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
if (opAr.isPresent()) {
// Only class ids known to Constants.accessRightsCoarMap are translated to a COAR code.
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
.ifPresent(value -> instance.setLicense(value.getValue()));
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
// Refereed and type are taken from the class name of the source value, not from its class id.
.ifPresent(value -> instance.setRefereed(value.getClassname()));
// .ifPresent(value -> instance.setRefereed(value.getValue()));
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
return instance;
/**
 * Collapses a list of provenances to at most one per kind (inferred, harvested, user claim), keeping for each
 * kind the entry selected by getHighestTrust.
 *
 * @param provenance the provenances to collapse; getProvenance() of each entry is used as switch selector, so a
 *            null value raises a NullPointerException
 * @return the selected provenances, in the order inferred, harvested, user claim; kinds never seen are dropped
 *         because their placeholder has an empty provenance
 */
private static List<Provenance> getUniqueProvenance(List<Provenance> provenance) {
// Empty placeholders, one per provenance kind; replaced as matching entries are found.
Provenance iProv = new Provenance();
// iProv.setProvenance(Constants.INFERRED);
Provenance hProv = new Provenance();
// hProv.setProvenance(Constants.HARVESTED);
Provenance lProv = new Provenance();
for (Provenance p : provenance) {
switch (p.getProvenance()) {
case Constants.HARVESTED:
hProv = getHighestTrust(hProv, p);
case Constants.INFERRED:
iProv = getHighestTrust(iProv, p);
// To be removed as soon as the new beta run has been done
// this fixes the issue of trust not being set during bulktagging
if (StringUtils.isEmpty(iProv.getTrust())) {
case Constants.USER_CLAIM:
lProv = getHighestTrust(lProv, p);
return Arrays
.asList(iProv, hProv, lProv)
// Placeholders that were never replaced still have an empty provenance and are filtered out.
.filter(p -> !StringUtils.isEmpty(p.getProvenance()))
/**
 * Chooses between two provenances according to their trust.
 * <p>
 * When both trusts are non-empty they are compared as strings (lexicographic order, not numeric) and
 * {@code hProv} wins only if its trust is strictly greater; on a tie {@code p} is returned. When at least one
 * trust is empty, {@code hProv} is returned only if it has a trust and {@code p} has none; otherwise {@code p}.
 *
 * @param hProv the provenance selected so far
 * @param p the candidate provenance
 * @return the provenance to keep
 */
private static Provenance getHighestTrust(Provenance hProv, Provenance p) {
if (StringUtils.isNoneEmpty(hProv.getTrust(), p.getTrust()))
return hProv.getTrust().compareTo(p.getTrust()) > 0 ? hProv : p;
return (StringUtils.isEmpty(p.getTrust()) && !StringUtils.isEmpty(hProv.getTrust())) ? hProv : p;
/**
 * Maps a structured property onto a dump Subject whose controlled field holds the qualifier class id and the
 * property value.
 *
 * @param s the subject property; its qualifier is dereferenced without a null check
 * @return the mapped subject
 */
private static Subject getSubject(StructuredProperty s) {
Subject subject = new Subject();
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
// A Provenance is built only when the property carries a DataInfo (the population lines are not visible here).
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
if (di.isPresent()) {
Provenance p = new Provenance();
return subject;
/**
 * Maps an OAF author onto the dump Author.
 *
 * @param oa the OAF author
 * @return the mapped author
 */
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
Optional<List<StructuredProperty>> oPids = Optional
if (oPids.isPresent()) {
// getOrcid returns null when no ORCID identifier is found, hence the null check.
Pid pid = getOrcid(oPids.get());
if (pid != null) {
return a;
/**
 * Returns a Pid for the first identifier in the list whose qualifier class id equals Constants.ORCID.
 *
 * @param p the identifiers of an author; each qualifier is dereferenced without a null check
 * @return the Pid built from the first ORCID identifier, or null when the list contains none
 */
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
// The Pid is built differently depending on whether the identifier carries a DataInfo.
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid
} else {
return Pid
return null;

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.dom4j.DocumentException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
* This class connects with the IS related to the isLookUpUrl got as parameter. It saves the information about the
* context that will guide the dump of the results. The information saved is a HashMap. The key is the id of a community
* - research infrastructure/initiative , the value is the label of the research community - research
* infrastructure/initiative.
public class SaveCommunityMap implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SaveCommunityMap.class);
private final QueryInformationSystem queryInformationSystem;
private final Configuration conf;
private final BufferedWriter writer;
/**
 * Opens the target file on the given name node and prepares the writer and the information system client.
 *
 * @param hdfsPath path of the file the community map is written to
 * @param hdfsNameNode value assigned to "fs.defaultFS"
 * @param isLookUpUrl address of the lookup service (its use is not visible in this excerpt)
 * @throws IOException if the file system cannot be reached or the file cannot be created
 */
public SaveCommunityMap(String hdfsPath, String hdfsNameNode, String isLookUpUrl) throws IOException {
conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fsDataOutputStream = fileSystem.create(hdfsWritePath);
queryInformationSystem = new QueryInformationSystem();
// The map is written as UTF-8 text.
writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
/**
 * Command line entry point: reads the "nameNode", "outputPath" and "isLookUpUrl" arguments and builds the
 * SaveCommunityMap instance from them.
 *
 * @param args command line arguments handled by ArgumentApplicationParser
 * @throws Exception declared by the method; the visible code does not handle any failure locally
 */
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
final String nameNode = parser.get("nameNode");
log.info("nameNode: {}", nameNode);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("isLookUpUrl: {}", isLookUpUrl);
final SaveCommunityMap scm = new SaveCommunityMap(outputPath, nameNode, isLookUpUrl);
private void saveCommunityMap() throws ISLookUpException, IOException, DocumentException {

View File

@ -0,0 +1,86 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
public class SendToZenodoHDFS implements Serializable {
private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
/**
 * Command line entry point: uploads to Zenodo every file found (recursively) under the "hdfsPath" argument.
 * <p>
 * Files whose path ends with "_SUCCESS" are skipped. When the file name without its extension is a key of the
 * community map, the upload name becomes the community label with spaces replaced by underscores plus ".tar".
 *
 * @param args command line arguments handled by ArgumentApplicationParser
 * @throws MissingConceptDoiException if an existing deposition must be used ("newDeposition" is false) and no
 *             concept record id has been provided
 * @throws Exception declared by the method; the visible code does not handle any other failure locally
 */
public static void main(final String[] args) throws Exception, MissingConceptDoiException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("nameNode");
final String access_token = parser.get("accessToken");
final String connection_url = parser.get("connectionUrl");
final String metadata = parser.get("metadata");
// Boolean.valueOf yields false for a missing or non-"true" value.
final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition"));
final String concept_rec_id = Optional
final String communityMapPath = parser.get("communityMapPath");
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
CommunityMap communityMap = Utils.readCommunityMap(fileSystem, communityMapPath);
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
new Path(hdfsPath), true);
ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
if (newDeposition) {
} else {
// Reusing an existing deposition requires its concept record id.
if (concept_rec_id == null) {
throw new MissingConceptDoiException("No concept record id has been provided");
while (fileStatusListIterator.hasNext()) {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
// String tmp = p_string.substring(0, p_string.lastIndexOf("/"));
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
log.info("Sending information for community: " + name);
// NOTE(review): a file name without "." makes lastIndexOf return -1 and substring throw; confirm that
// every file under hdfsPath has an extension.
if (communityMap.containsKey(name.substring(0, name.lastIndexOf(".")))) {
name = communityMap.get(name.substring(0, name.lastIndexOf("."))).replace(" ", "_") + ".tar";
FSDataInputStream inputStream = fileSystem.open(p);
zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());

View File

@ -0,0 +1,73 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.graph.Constants;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class Utils {
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
/**
 * Removes the given path through HdfsSupport, using the Hadoop configuration of the Spark context.
 *
 * @param spark session providing the Hadoop configuration
 * @param path path to remove
 */
public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
/**
 * Reads a dataset of {@code clazz} instances, deserializing every input string with the shared OBJECT_MAPPER.
 *
 * @param spark the active Spark session
 * @param inputPath location of the input (how it is opened is not visible in this excerpt)
 * @param clazz target bean class, also used to build the encoder
 * @return the typed dataset
 */
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
/**
 * Obtains the lookup service client for the given URL from ISLookupClientFactory.
 *
 * @param isLookUpUrl address of the lookup service
 * @return the lookup service client
 */
public static ISLookUpService getIsLookUpService(String isLookUpUrl) {
return ISLookupClientFactory.getLookUpService(isLookUpUrl);
/**
 * Builds the identifier of a context entity following the pattern {@code <CONTEXT_ID>|<CONTEXT_NS_PREFIX>::<x>}.
 * The third component is derived from {@code id}; its derivation is not visible in this excerpt.
 *
 * @param id the source context id
 * @return the formatted identifier
 */
public static String getContextId(String id) {
return String
"%s|%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
/**
 * Loads the community map with Spark: the first line of the text file at {@code communityMapPath} is parsed as
 * JSON with Gson. Any further line is ignored, and an empty file makes {@code get(0)} fail.
 *
 * @param spark the active Spark session
 * @param communityMapPath path of the serialized community map
 * @return the deserialized community map
 */
public static CommunityMap getCommunityMap(SparkSession spark, String communityMapPath) {
return new Gson().fromJson(spark.read().textFile(communityMapPath).collectAsList().get(0), CommunityMap.class);
public static CommunityMap readCommunityMap(FileSystem fileSystem, String communityMapPath) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(fileSystem.open(new Path(communityMapPath))));
StringBuffer sb = new StringBuffer();
try {
String line;
while ((line = br.readLine()) != null) {
} finally {
return new Gson().fromJson(sb.toString(), CommunityMap.class);

View File

@ -0,0 +1,8 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.HashMap;
/**
 * Serializable map from the id of a research community / research initiative to its label (see
 * QueryInformationSystem, which fills it from the "id" and "label" attributes of each context).
 */
public class CommunityMap extends HashMap<String, String> implements Serializable {

View File

@ -0,0 +1,83 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
* This class splits the dumped results according to the research community - research initiative/infrastructure they
* are related to. The information about the community is found in the element "context.id" in the result. Since the
* context that can be found in the result can be associated not only to communities, a community Map is provided. It
* will guide the splitting process. Note the repartition(1) just before writing the results related to a community:
* this is a choice due to uploading constraints (just one file for each community). As soon as a better solution is
* in place, remove the repartition.
public class CommunitySplit implements Serializable {
/**
 * Clears the output directory and splits the dumped results by community.
 *
 * @param isSparkSessionManaged not used in the lines visible here; presumably forwarded to runWithSparkSession
 *            (TODO confirm)
 * @param inputPath root path of the dumped results
 * @param outputPath root path of the per-community output; removed before the split starts
 * @param communityMapPath path of the serialized community map; its keys are the communities to split for
 */
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath) {
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath);
execSplit(spark, inputPath, outputPath, Utils.getCommunityMap(spark, communityMapPath).keySet());
/**
 * Builds the union of the four dumped result types and writes one output per community.
 *
 * @param spark the active Spark session
 * @param inputPath root path containing the "publication", "dataset", "orp" and "software" sub-directories
 * @param outputPath root path of the per-community output
 * @param communities ids of the communities to produce an output for
 */
private static void execSplit(SparkSession spark, String inputPath, String outputPath,
Set<String> communities) {
Dataset<CommunityResult> result = Utils
.readPath(spark, inputPath + "/publication", CommunityResult.class)
.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
.union(Utils.readPath(spark, inputPath + "/orp", CommunityResult.class))
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
// One filtering pass over the union per community.
.forEach(c -> printResult(c, result, outputPath));
/**
 * Writes the results related to community {@code c} as gzip-compressed JSON under {@code outputPath/c}.
 *
 * @param c id of the community
 * @param result all dumped results
 * @param outputPath root path of the per-community output
 */
private static void printResult(String c, Dataset<CommunityResult> result, String outputPath) {
Dataset<CommunityResult> community_products = result
.filter(r -> containsCommunity(r, c));
// NOTE(review): exceptions raised while writing are caught below; the handling is not visible in this excerpt.
try {
.option("compression", "gzip")
.json(outputPath + "/" + c);
} catch (Exception e) {
/**
 * Tells whether the result is related to the given community, i.e. whether at least one of its contexts has a
 * code equal to {@code c}.
 *
 * @param r the dumped result
 * @param c id of the community
 * @return true when a matching context exists, false when there is none or the result has no context
 */
private static boolean containsCommunity(CommunityResult r, String c) {
if (Optional.ofNullable(r.getContext()).isPresent()) {
return r
.filter(con -> con.getCode().equals(c))
.size() > 0;
return false;

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
/**
 * Serializable bean associating the id of a result with the list of projects related to it.
 */
public class ResultProject implements Serializable {
// Identifier of the result.
private String resultId;
// Projects (dump model) associated with the result.
private List<Project> projectsList;
public String getResultId() {
return resultId;
public void setResultId(String resultId) {
this.resultId = resultId;
public List<Project> getProjectsList() {
return projectsList;
public void setProjectsList(List<Project> projectsList) {
this.projectsList = projectsList;

View File

@ -0,0 +1,62 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.Result;
* Spark action to trigger the dump of results associated to research community - research initiative/infrastructure. The
* actual dump is performed via the class DumpProducts, which is also used for the entire graph dump.
public class SparkDumpCommunityProducts implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
/**
 * Command line entry point: reads the "sourcePath", "outputPath", "resultTableName" and "communityMapPath"
 * arguments and runs DumpProducts with CommunityResult as output class.
 *
 * @param args command line arguments handled by ArgumentApplicationParser
 * @throws Exception declared by the method; includes ClassNotFoundException when "resultTableName" is not a
 *             loadable class name
 */
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
// "resultTableName" carries the fully qualified class name of the result type to dump.
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
String communityMapPath = parser.get("communityMapPath");
// Unchecked cast: the class is only assumed to be a subtype of Result.
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
DumpProducts dump = new DumpProducts();
isSparkSessionManaged, inputPath, outputPath, communityMapPath, inputClazz, CommunityResult.class,

View File

@ -0,0 +1,185 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.community.Funder;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
* Preparation of the Project information to be added to the dumped results. For each result associated to at least one
* Project, an instance of the ResultProject class is serialized. ResultProject contains the resultId and the
* list of Projects (as in eu.dnetlib.dhp.schema.dump.oaf.community.Project) it is associated to.
public class SparkPrepareResultProject implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class);
/**
 * Command line entry point: reads the "sourcePath" and "outputPath" arguments, clears the output directory and
 * prepares the result-project associations.
 *
 * @param args command line arguments handled by ArgumentApplicationParser
 * @throws Exception declared by the method; the visible code does not handle any failure locally
 */
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath);
prepareResultProjectList(spark, inputPath, outputPath);
/**
 * Joins projects with their non-deleted 'produces' relations, groups the pairs and emits one ResultProject per
 * group, written with gzip compression.
 *
 * @param spark the active Spark session
 * @param inputPath root path containing the "relation" and "project" sub-directories
 * @param outputPath target of the output (its use is not visible in this excerpt)
 */
private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) {
// Keep only relations not deleted by inference whose class is 'produces'.
Dataset<Relation> relation = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter("dataInfo.deletedbyinference = false and relClass = 'produces'");
Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils
.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);
// The project is the source of the relation.
.joinWith(relation, projects.col("id").equalTo(relation.col("source")))
(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value
(MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s,
it) -> {
// Ids of the projects already handled for this group, used to skip duplicates below.
Set<String> projectSet = new HashSet<>();
Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
ResultProject rp = new ResultProject();
eu.dnetlib.dhp.schema.oaf.Project p = first._1();
Project ps = getProject(p);
List<Project> projList = new ArrayList<>();
it.forEachRemaining(c -> {
eu.dnetlib.dhp.schema.oaf.Project op = c._1();
if (!projectSet.contains(op.getId())) {
return rp;
}, Encoders.bean(ResultProject.class))
.option("compression", "gzip")
/**
 * Maps an OAF project onto the dump Project, including the funders extracted from its funding trees.
 *
 * @param op the OAF project
 * @return the mapped project
 */
private static Project getProject(eu.dnetlib.dhp.schema.oaf.Project op) {
Project p = Project
.map(a -> a.getValue())
.map(v -> v.getValue())
value -> value
// Each funding tree value is an XML string parsed by getFunder.
.map(ft -> getFunder(ft.getValue()))
Optional<DataInfo> di = Optional.ofNullable(op.getDataInfo());
Provenance provenance = new Provenance();
if (di.isPresent()) {
return p;
/**
 * Extracts the funder information from the XML serialization of a funding tree.
 * <p>
 * Short name, name and jurisdiction are read from the first match of the corresponding {@code //funder/...}
 * element; a missing element makes {@code get(0)} throw. The funding stream is taken from the first
 * {@code ./name} child of each {@code funding_level_0} element, so the last such element wins.
 *
 * @param fundingtree XML string describing the funding tree (see the example below)
 * @return the funder; when the XML cannot be parsed the DocumentException is caught and the funder built so far
 *         is returned
 */
private static Funder getFunder(String fundingtree) {
// ["<fundingtree><funder><id>nsf_________::NSF</id><shortname>NSF</shortname><name>National Science
// Foundation</name><jurisdiction>US</jurisdiction></funder><funding_level_1><id>nsf_________::NSF::CISE/OAD::CISE/CCF</id><description>Division
// of Computing and Communication Foundations</description><name>Division of Computing and Communication
// Foundations</name><parent><funding_level_0><id>nsf_________::NSF::CISE/OAD</id><description>Directorate for
// Computer &amp; Information Science &amp; Engineering</description><name>Directorate for Computer &amp;
// Information Science &amp;
// Engineering</name><parent/><class>nsf:fundingStream</class></funding_level_0></parent></funding_level_1></fundingtree>"]
Funder f = new Funder();
final Document doc;
try {
// NOTE(review): SAXReader is used with its default configuration; confirm funding trees are trusted input.
doc = new SAXReader().read(new StringReader(fundingtree));
f.setShortName(((Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
for (Object o : doc.selectNodes("//funding_level_0")) {
List node = ((Node) o).selectNodes("./name");
f.setFundingStream(((Node) node.get(0)).getText());
return f;
} catch (DocumentException e) {
return f;

View File

@ -0,0 +1,50 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
* Spark job to trigger the split of results associated to research community - research initiative/infrastructure. The
* actual split is performed by the class CommunitySplit.
public class SparkSplitForCommunity implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSplitForCommunity.class);
/**
 * Entry point of the job: parses the arguments, logs them and delegates to {@code CommunitySplit.run}.
 *
 * Arguments read from the parser: {@code sourcePath}, {@code outputPath} and {@code communityMapPath}.
 *
 * @param args the command line arguments handed to the argument parser
 * @throws Exception declared by the signature; the throwing statements are not all visible in this view
 */
public static void main(String[] args) throws Exception {
// NOTE(review): the next statement is cut short in this view (its continuation lines are not visible).
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
// NOTE(review): truncated statement — the Optional chain producing the flag is not visible.
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
// Unlike the other arguments, communityMapPath is not logged.
final String communityMapPath = parser.get("communityMapPath");
CommunitySplit split = new CommunitySplit();
split.run(isSparkSessionManaged, inputPath, outputPath, communityMapPath);

View File

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.oa.graph.dump.community;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
/**
 * Spark job that joins the dumped community results with previously prepared result/project information.
 * NOTE(review): several statements of this class are truncated in this view; comments only describe what is visible.
 */
public class SparkUpdateProjectInfo implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
// Shared JSON mapper; it is not referenced by any statement visible in this class.
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
/**
 * Entry point of the job: parses and logs the arguments ({@code sourcePath}, {@code outputPath},
 * {@code preparedInfoPath}), removes the output directory and calls {@code extend}.
 *
 * @param args the command line arguments handed to the argument parser
 * @throws Exception declared by the signature
 */
public static void main(String[] args) throws Exception {
// NOTE(review): truncated statement — continuation not visible.
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
// NOTE(review): truncated statement — continuation not visible.
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String preparedInfoPath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", preparedInfoPath);
SparkConf conf = new SparkConf();
// Body executed with the Spark session: the output directory is removed before anything is written.
spark -> {
Utils.removeOutputDir(spark, outputPath);
extend(spark, inputPath, outputPath, preparedInfoPath);// , inputClazz);
/**
 * Reads the community results and the prepared {@code ResultProject} records and combines them on
 * {@code result.id == resultProject.resultId}.
 *
 * @param spark the Spark session
 * @param inputPath path of the serialized {@code CommunityResult} records
 * @param outputPath path the extended results are meant to be written to (the write call is not visible here)
 * @param preparedInfoPath path of the serialized {@code ResultProject} records
 */
private static void extend(
SparkSession spark,
String inputPath,
String outputPath,
String preparedInfoPath) {
Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
// NOTE(review): the join call and its join type are not visible; only the join condition is.
resultProject, result.col("id").equalTo(resultProject.col("resultId")),
.map(value -> {
CommunityResult r = value._1();
// The right side of the pair may be null, hence the Optional wrapper; what is copied onto r is not visible.
Optional.ofNullable(value._2()).ifPresent(rp -> {
return r;
}, Encoders.bean(CommunityResult.class))
.option("compression", "gzip")

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.Serializable;
/**
 * String constants used by the graph dump classes of this package: relation names, entity type labels and
 * values related to context identifiers.
 */
public class Constants implements Serializable {
// Relation names; IS_HOSTED_BY and HOSTS are passed to Extractor.getRelatioPair as the two relation names.
public static final String IS_HOSTED_BY = "isHostedBy";
public static final String HOSTS = "hosts";
public static final String IS_FUNDED_BY = "isFundedBy";
public static final String FUNDS = "funds";
public static final String FUNDINGS = "fundings";
// Labels of the entity types (RESULT_ENTITY and DATASOURCE_ENTITY are passed as node types by the Extractor).
public static final String RESULT_ENTITY = "result";
public static final String DATASOURCE_ENTITY = "datasource";
public static final String CONTEXT_ENTITY = "context";
public static final String ORGANIZATION_ENTITY = "organization";
public static final String PROJECT_ENTITY = "project";
// NOTE(review): presumably the id prefix and namespace prefix used to build context identifiers — confirm
// against Utils.getContextId, which is not visible here.
public static final String CONTEXT_ID = "00";
public static final String CONTEXT_NS_PREFIX = "context_____";
// public static final String FUNDER_DS = "entityregistry::projects";

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.Serializable;
import java.util.List;
/**
 * Deserialization of the information in the context needed to create Context Entities, and the relations between
 * context entities and datasources and projects.
 */
public class ContextInfo implements Serializable {

	/** Identifier of the context. */
	private String id;

	/** Free-text description of the context. */
	private String description;

	/** Type of the context; Process.getEntity compares it against "community". */
	private String type;

	/** Zenodo community associated with the context, if any. */
	private String zenodocommunity;

	/** Name (label) of the context. */
	private String name;

	/** Identifiers of the projects related to the context. */
	private List<String> projectList;

	/** Identifiers of the datasources related to the context. */
	private List<String> datasourceList;

	/** Subjects associated with the context. */
	private List<String> subject;

	/** @return the subjects associated with the context, or {@code null} when never set */
	public List<String> getSubject() {
		return subject;
	}

	/** @param subject the subjects associated with the context */
	public void setSubject(List<String> subject) {
		this.subject = subject;
	}

	/** @return the name of the context, or {@code null} when never set */
	public String getName() {
		return name;
	}

	/** @param name the name of the context */
	public void setName(String name) {
		this.name = name;
	}

	/** @return the identifier of the context, or {@code null} when never set */
	public String getId() {
		return id;
	}

	/** @param id the identifier of the context */
	public void setId(String id) {
		this.id = id;
	}

	/** @return the description of the context, or {@code null} when never set */
	public String getDescription() {
		return description;
	}

	/** @param description the description of the context */
	public void setDescription(String description) {
		this.description = description;
	}

	/** @return the type of the context, or {@code null} when never set */
	public String getType() {
		return type;
	}

	/** @param type the type of the context */
	public void setType(String type) {
		this.type = type;
	}

	/** @return the Zenodo community of the context, or {@code null} when never set */
	public String getZenodocommunity() {
		return zenodocommunity;
	}

	/** @param zenodocommunity the Zenodo community of the context */
	public void setZenodocommunity(String zenodocommunity) {
		this.zenodocommunity = zenodocommunity;
	}

	/** @return the identifiers of the related projects, or {@code null} when never set */
	public List<String> getProjectList() {
		return projectList;
	}

	/** @param projectList the identifiers of the related projects */
	public void setProjectList(List<String> projectList) {
		this.projectList = projectList;
	}

	/** @return the identifiers of the related datasources, or {@code null} when never set */
	public List<String> getDatasourceList() {
		return datasourceList;
	}

	/** @param datasourceList the identifiers of the related datasources */
	public void setDatasourceList(List<String> datasourceList) {
		this.datasourceList = datasourceList;
	}

}

View File

@ -0,0 +1,105 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
/**
 * Writes Context entities to HDFS. It queries the Information System at the lookup URL provided as a parameter and
 * collects the general information for contexts of type community or ri. The general information is the id of the
 * context, its label, the subjects associated with the context, its Zenodo community, its description and its type.
 * This information is used to create a new Context Entity.
 */
public class CreateContextEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CreateContextEntities.class);
// NOTE(review): the class is declared Serializable although BufferedWriter is not — confirm instances are never
// actually serialized.
private final Configuration conf;
private final BufferedWriter writer;
/**
 * Entry point: parses and logs the arguments ({@code hdfsPath}, {@code nameNode}, {@code isLookUpUrl}), opens the
 * HDFS output and runs {@code execute} with {@code Process::getEntity} as producer.
 *
 * @param args the command line arguments handed to the argument parser
 * @throws Exception declared by the signature
 */
public static void main(String[] args) throws Exception {
// NOTE(review): truncated statement — continuation not visible in this view.
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("isLookUpUrl: {}", isLookUpUrl);
final CreateContextEntities cce = new CreateContextEntities(hdfsPath, hdfsNameNode);
log.info("Processing contexts...");
cce.execute(Process::getEntity, isLookUpUrl);
// NOTE(review): the body of close() and any call to it are not visible in this view — confirm the writer is
// closed after execute().
private void close() throws IOException {
/**
 * Opens a UTF-8 writer on the given HDFS path.
 *
 * @param hdfsPath path of the file to write
 * @param hdfsNameNode value set as {@code fs.defaultFS} in the Hadoop configuration
 * @throws IOException declared by the signature (file system access)
 */
public CreateContextEntities(String hdfsPath, String hdfsNameNode) throws IOException {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
// An existing file is appended to rather than overwritten, so a re-run adds to the previous content.
if (fileSystem.exists(hdfsWritePath)) {
fsDataOutputStream = fileSystem.append(hdfsWritePath);
} else {
fsDataOutputStream = fileSystem.create(hdfsWritePath);
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
/**
 * Builds a consumer that turns every {@link ContextInfo} into an entity through {@code producer} and writes it.
 *
 * @param producer maps a context description to the entity to be written
 * @param isLookUpUrl URL of the lookup service (its use is not visible in this view)
 * @throws Exception declared by the signature
 */
public <R extends ResearchInitiative> void execute(final Function<ContextInfo, R> producer, String isLookUpUrl)
throws Exception {
QueryInformationSystem queryInformationSystem = new QueryInformationSystem();
final Consumer<ContextInfo> consumer = ci -> writeEntity(producer.apply(ci));
/**
 * Writes one entity; any exception raised while writing is rethrown wrapped in a RuntimeException.
 * NOTE(review): the actual write statements are not visible in this view.
 *
 * @param r the entity to write
 */
protected <R extends ResearchInitiative> void writeEntity(final R r) {
try {
// log.info("writing context : {}", new Gson().toJson(r));
} catch (final Exception e) {
throw new RuntimeException(e);

View File

@ -0,0 +1,124 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
/**
 * Writes the set of new Relations between the context and the datasources. At the moment the relations between the
 * context and the projects are not created, because of the low coverage in the profiles of the OpenAIRE ids related
 * to projects.
 */
public class CreateContextRelation implements Serializable {
// Logger bound to this class (it was previously created for CreateContextEntities, which mislabelled the log
// lines emitted here).
private static final Logger log = LoggerFactory.getLogger(CreateContextRelation.class);
private final Configuration conf;
private final BufferedWriter writer;
private final QueryInformationSystem queryInformationSystem;
// Category names matched against the last "::"-separated segment of the category id in the context profile.
private static final String CONTEX_RELATION_DATASOURCE = "contentproviders";
// Only referenced by the commented-out project relation call in main.
private static final String CONTEX_RELATION_PROJECT = "projects";
/**
 * Entry point: parses and logs the arguments ({@code hdfsPath}, {@code nameNode}, {@code isLookUpUrl}) and
 * creates the context/datasource relations. The creation of the context/project relations is commented out.
 *
 * @param args the command line arguments handed to the argument parser
 * @throws Exception declared by the signature
 */
public static void main(String[] args) throws Exception {
// NOTE(review): truncated statement — continuation not visible in this view.
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
// NOTE(review): truncated statement — continuation not visible in this view.
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("isLookUpUrl: {}", isLookUpUrl);
final CreateContextRelation cce = new CreateContextRelation(hdfsPath, hdfsNameNode, isLookUpUrl);
log.info("Creating relation for datasource...");
cce.execute(Process::getRelation, CONTEX_RELATION_DATASOURCE, ModelSupport.getIdPrefix(Datasource.class));
// NOTE(review): this message is logged although the project relation call below is disabled.
log.info("Creating relations for projects... ");
// cce
// .execute(
// Process::getRelation, CONTEX_RELATION_PROJECT,
// ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
// NOTE(review): the body of close() and any call to it are not visible in this view.
private void close() throws IOException {
/**
 * Opens a UTF-8 writer on the given HDFS path and creates the helper used to query the Information System.
 *
 * @param hdfsPath path of the file to write
 * @param hdfsNameNode value set as {@code fs.defaultFS} in the Hadoop configuration
 * @param isLookUpUrl URL of the lookup service (its use is not visible in this view)
 * @throws IOException declared by the signature (file system access)
 * @throws ISLookUpException declared by the signature
 */
public CreateContextRelation(String hdfsPath, String hdfsNameNode, String isLookUpUrl)
throws IOException, ISLookUpException {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
queryInformationSystem = new QueryInformationSystem();
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
// An existing file is appended to rather than overwritten, so a re-run adds to the previous content.
if (fileSystem.exists(hdfsWritePath)) {
fsDataOutputStream = fileSystem.append(hdfsWritePath);
} else {
fsDataOutputStream = fileSystem.create(hdfsWritePath);
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
/**
 * Produces the relations for every context through {@code producer} and writes each of them.
 *
 * @param producer maps a context description to the relations to write
 * @param category category of the context profile to consider
 * @param prefix prefix prepended to the identifiers found in the category
 */
public void execute(final Function<ContextInfo, List<Relation>> producer, String category, String prefix) {
final Consumer<ContextInfo> consumer = ci -> producer.apply(ci).forEach(c -> writeEntity(c));
queryInformationSystem.getContextRelation(consumer, category, prefix);
/**
 * Writes one relation; any exception raised while writing is rethrown wrapped in a RuntimeException.
 * NOTE(review): the actual write statements are not visible in this view.
 *
 * @param r the relation to write
 */
protected void writeEntity(final Relation r) {
try {
} catch (final Exception e) {
throw new RuntimeException(e);

View File

@ -0,0 +1,496 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
/**
 * Dumps the entities in the model defined in eu.dnetlib.dhp.schema.dump.oaf.graph. Results are dumped using the same
 * Mapper as for eu.dnetlib.dhp.schema.dump.oaf.community, while for the other entities the mapping is defined below.
 */
public class DumpGraphEntities implements Serializable {
/**
 * Dispatches the dump according to the id prefix registered for {@code inputClazz} in
 * {@code ModelSupport.idPrefixMap}.
 *
 * @param isSparkSessionManaged flag forwarded to the Spark session handling
 * @param inputPath path of the entities to dump
 * @param outputPath path where the dumped entities are written; removed first for prefixes 40, 20 and 10
 * @param inputClazz class of the input entities
 * @param communityMapPath path of the community map; only forwarded to DumpProducts (prefix 50)
 */
public void run(Boolean isSparkSessionManaged,
String inputPath,
String outputPath,
Class<? extends OafEntity> inputClazz,
String communityMapPath) {
SparkConf conf = new SparkConf();
// NOTE(review): a switch on a null String throws NullPointerException, which happens if inputClazz has no
// entry in idPrefixMap. No break statements or default branch are visible between the cases in this view.
switch (ModelSupport.idPrefixMap.get(inputClazz)) {
// prefix 50: delegated to DumpProducts with the dump Result class as target
case "50":
DumpProducts d = new DumpProducts();
isSparkSessionManaged, inputPath, outputPath, communityMapPath, inputClazz, Result.class,
// prefix 40: handled by projectMap
case "40":
spark -> {
Utils.removeOutputDir(spark, outputPath);
projectMap(spark, inputPath, outputPath, inputClazz);
// prefix 20: handled by organizationMap
case "20":
spark -> {
Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath, inputClazz);
// prefix 10: handled by datasourceMap
case "10":
spark -> {
Utils.removeOutputDir(spark, outputPath);
datasourceMap(spark, inputPath, outputPath, inputClazz);
/**
 * Reads the input entities, maps each of them with {@code mapDatasource} and writes the result.
 * NOTE(review): the head and tail of the pipeline (reader owner, write mode, output call) are not visible here.
 */
private static <E extends OafEntity> void datasourceMap(SparkSession spark, String inputPath, String outputPath,
Class<E> inputClazz) {
.readPath(spark, inputPath, inputClazz)
// The cast assumes inputClazz is the OAF Datasource class, which holds when called from case "10".
.map(d -> mapDatasource((eu.dnetlib.dhp.schema.oaf.Datasource) d), Encoders.bean(Datasource.class))
.option("compression", "gzip")
/**
 * Reads the input entities, maps each of them with {@code mapProject} and writes the result.
 * NOTE(review): the head and tail of the pipeline are not visible here.
 */
private static <E extends OafEntity> void projectMap(SparkSession spark, String inputPath, String outputPath,
Class<E> inputClazz) {
.readPath(spark, inputPath, inputClazz)
.map(p -> mapProject((eu.dnetlib.dhp.schema.oaf.Project) p), Encoders.bean(Project.class))
.option("compression", "gzip")
/**
 * Maps an OAF datasource onto the dump model {@code Datasource}. Each property is copied only when present.
 * NOTE(review): the Optional expressions heading most of the statements below are not visible in this view, so the
 * source field of each value cannot be confirmed from here.
 *
 * @param d the OAF datasource
 * @return the dump datasource
 */
private static Datasource mapDatasource(eu.dnetlib.dhp.schema.oaf.Datasource d) {
Datasource datasource = new Datasource();
// Null original ids are filtered out.
oId -> datasource.setOriginalId(oId.stream().filter(Objects::nonNull).collect(Collectors.toList())));
pids -> pids
.map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
dsType -> datasource
.setDatasourcetype(ControlledField.newInstance(dsType.getClassid(), dsType.getClassname())));
.ifPresent(v -> datasource.setOpenairecompatibility(v.getClassname()));
.ifPresent(oname -> datasource.setOfficialname(oname.getValue()));
.ifPresent(ename -> datasource.setEnglishname(ename.getValue()));
.ifPresent(wsite -> datasource.setWebsiteurl(wsite.getValue()));
.ifPresent(lurl -> datasource.setLogourl(lurl.getValue()));
.ifPresent(dval -> datasource.setDateofvalidation(dval.getValue()));
.ifPresent(dex -> datasource.setDescription(dex.getValue()));
sbjs -> datasource.setSubjects(sbjs.stream().map(sbj -> sbj.getValue()).collect(Collectors.toList())));
// NOTE(review): setPolicies is called here and again further below; if both values are present the later
// call replaces this one — confirm that is intended.
.ifPresent(odp -> datasource.setPolicies(Arrays.asList(odp.getValue())));
langs -> datasource
.setLanguages(langs.stream().map(lang -> lang.getValue()).collect(Collectors.toList())));
ctypes -> datasource
.setContenttypes(ctypes.stream().map(ctype -> ctype.getValue()).collect(Collectors.toList())));
.ifPresent(rd -> datasource.setReleasestartdate(rd.getValue()));
.ifPresent(ed -> datasource.setReleaseenddate(ed.getValue()));
.ifPresent(ms -> datasource.setMissionstatementurl(ms.getValue()));
.ifPresent(ar -> datasource.setAccessrights(ar.getValue()));
.ifPresent(dut -> datasource.setUploadrights(dut.getValue()));
.ifPresent(dar -> datasource.setDatabaseaccessrestriction(dar.getValue()));
.ifPresent(dur -> datasource.setDatauploadrestriction(dur.getValue()));
.ifPresent(v -> datasource.setVersioning(v.getValue()));
.ifPresent(cu -> datasource.setCitationguidelineurl(cu.getValue()));
.ifPresent(ps -> datasource.setPidsystems(ps.getValue()));
.ifPresent(c -> datasource.setCertificates(c.getValue()));
.ifPresent(ps -> datasource.setPolicies(ps.stream().map(p -> p.getValue()).collect(Collectors.toList())));
.ifPresent(j -> datasource.setJournal(getContainer(j)));
return datasource;
/**
 * Maps an OAF journal onto the dump model {@code Container}, copying each property only when present.
 *
 * @param j the OAF journal
 * @return the container
 */
private static Container getContainer(Journal j) {
Container c = new Container();
.ifPresent(n -> c.setName(n));
.ifPresent(issnp -> c.setIssnPrinted(issnp));
.ifPresent(issno -> c.setIssnOnline(issno));
.ifPresent(isnl -> c.setIssnLinking(isnl));
.ifPresent(ep -> c.setEp(ep));
.ifPresent(iss -> c.setIss(iss));
.ifPresent(sp -> c.setSp(sp));
.ifPresent(vol -> c.setVol(vol));
.ifPresent(edition -> c.setEdition(edition));
.ifPresent(cdate -> c.setConferencedate(cdate));
.ifPresent(cplace -> c.setConferenceplace(cplace));
return c;
/**
 * Maps an OAF project onto the dump model {@code Project}.
 *
 * @param p the OAF project
 * @return the dump project
 * @throws DocumentException declared by the signature; {@code getFunder} declares it for unparsable funding trees
 */
private static Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p) throws DocumentException {
Project project = new Project();
.ifPresent(id -> project.setId(id));
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
.ifPresent(code -> project.setCode(code.getValue()));
.ifPresent(acronynim -> project.setAcronym(acronynim.getValue()));
.ifPresent(title -> project.setTitle(title.getValue()));
.ifPresent(sdate -> project.setStartdate(sdate.getValue()));
.ifPresent(edate -> project.setEnddate(edate.getValue()));
.ifPresent(cide -> project.setCallidentifier(cide.getValue()));
.ifPresent(key -> project.setKeywords(key.getValue()));
// The mandate flag becomes true when either oamandatepublications or ecsc39 carries the value "true".
// NOTE(review): getValue() is dereferenced without a null check; the statement consuming `mandate` is not
// visible in this view.
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
boolean mandate = false;
if (omandate.isPresent()) {
if (omandate.get().getValue().equals("true")) {
mandate = true;
if (oecsc39.isPresent()) {
if (oecsc39.get().getValue().equals("true")) {
mandate = true;
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true")));
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
.ifPresent(summary -> project.setSummary(summary.getValue()));
// Granted amount: the visible branches build it from currency, total cost and funded amount, or from currency
// and funded amount only.
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
Optional<Float> ototalcost = Optional.ofNullable(p.getTotalcost());
if (ocurrency.isPresent()) {
if (ofundedamount.isPresent()) {
if (ototalcost.isPresent()) {
Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
} else {
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
programme -> programme
.map(pg -> Programme.newInstance(pg.getCode(), pg.getDescription()))
.orElse(new ArrayList<>()));
// NOTE(review): truncated statement — the source of the funding trees is not visible.
Optional<List<Field<String>>> ofundTree = Optional
List<Funder> funList = new ArrayList<>();
if (ofundTree.isPresent()) {
for (Field<String> fundingtree : ofundTree.get()) {
return project;
/**
 * Builds a {@link Funder} from the XML serialization of a funding tree.
 *
 * Short name, name and jurisdiction are read from the {@code //funder} element. The funding levels are then
 * visited starting from {@code funding_level_0} and increasing the level until no node is found: the id of the
 * last visited node (stripped of everything up to the first "::") is kept, and the descriptions are concatenated
 * with " - " as separator.
 *
 * @param fundingtree the funding tree as an XML string
 * @return the funder
 * @throws DocumentException when the XML cannot be parsed
 */
public static Funder getFunder(String fundingtree) throws DocumentException {
Funder f = new Funder();
final Document doc;
// NOTE(review): SAXReader is used with its default configuration — confirm whether external entity resolution
// needs to be disabled for this input.
doc = new SAXReader().read(new StringReader(fundingtree));
// NOTE(review): each get(0) below throws IndexOutOfBoundsException when the XPath matches no node.
f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
// f.setId(((org.dom4j.Node) (doc.selectNodes("//funder/id").get(0))).getText());
String id = "";
String description = "";
// List<Levels> fundings = new ArrayList<>();
int level = 0;
List<org.dom4j.Node> nodes = doc.selectNodes("//funding_level_" + level);
while (nodes.size() > 0) {
for (org.dom4j.Node n : nodes) {
List node = n.selectNodes("./id");
id = ((org.dom4j.Node) node.get(0)).getText();
// Keeps what follows the first "::"; when "::" is absent indexOf returns -1 and the first character is dropped.
id = id.substring(id.indexOf("::") + 2);
node = n.selectNodes("./description");
description += ((Node) node.get(0)).getText() + " - ";
level += 1;
nodes = doc.selectNodes("//funding_level_" + level);
if (!id.equals("")) {
Fundings fundings = new Fundings();
// Removes the trailing " - " separator appended in the loop above.
fundings.setDescription(description.substring(0, description.length() - 3).trim());
return f;
/**
 * Reads the input entities, maps each of them with {@code mapOrganization} and writes the result.
 * NOTE(review): the head and tail of the pipeline are not visible here.
 */
private static <E extends OafEntity> void organizationMap(SparkSession spark, String inputPath, String outputPath,
Class<E> inputClazz) {
.readPath(spark, inputPath, inputClazz)
.map(o -> mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) o), Encoders.bean(Organization.class))
.option("compression", "gzip")
/**
 * Maps an OAF organization onto the dump model {@code Organization}, copying each property only when present.
 *
 * @param org the OAF organization
 * @return the dump organization
 */
private static Organization mapOrganization(eu.dnetlib.dhp.schema.oaf.Organization org) {
Organization organization = new Organization();
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
.ifPresent(value -> organization.setLegalname(value.getValue()));
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
value -> organization
.map(v -> v.getValue())
value -> organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname())));
.ifPresent(value -> organization.setId(value));
value -> organization
.map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
return organization;

View File

@ -0,0 +1,197 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Node;
import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
 * Creates new Relations (as in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation) from the information in the Entity.
 * The new Relations are created for the datasources in the collectedfrom and hostedby elements and for the contexts
 * related to communities and research initiatives/infrastructures.
 * For collectedfrom elements it creates: datasource -> provides -> result and result -> isProvidedBy -> datasource.
 * For hostedby elements it creates: datasource -> hosts -> result and result -> isHostedBy -> datasource.
 * For context elements it creates: context <-> isRelatedTo <-> result.
 */
public class Extractor implements Serializable {
/**
 * Removes the output directory and extracts the relations from the results found at {@code inputPath}.
 *
 * @param isSparkSessionManaged flag forwarded to the Spark session handling
 * @param inputPath path of the results
 * @param outputPath path where the relations are written; removed before the extraction
 * @param inputClazz class of the results to read
 * @param communityMapPath path of the community map, loaded through {@code Utils.getCommunityMap}
 */
public void run(Boolean isSparkSessionManaged,
String inputPath,
String outputPath,
Class<? extends Result> inputClazz,
String communityMapPath) {
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath);
spark, inputPath, outputPath, inputClazz, Utils.getCommunityMap(spark, communityMapPath));
/**
 * Produces, for every result, the relations towards its collectedfrom/hostedby datasources and towards the
 * contexts whose id (up to the first ":") is a key of the community map.
 * NOTE(review): several statements are truncated in this view; comments only describe what is visible.
 */
private <R extends Result> void extractRelationResult(SparkSession spark,
String inputPath,
String outputPath,
Class<R> inputClazz,
CommunityMap communityMap) {
// NOTE(review): this set is created outside the flatMap lambda and captured by it. Spark ships a copy of
// captured variables to each task, so the de-duplication presumably works per task only — confirm intent.
// Relations are also compared by hashCode only, so distinct relations with colliding hashes count as duplicates.
Set<Integer> hashCodes = new HashSet<>();
.readPath(spark, inputPath, inputClazz)
.flatMap((FlatMapFunction<R, Relation>) value -> {
List<Relation> relationList = new ArrayList<>();
.ifPresent(inst -> inst.forEach(instance -> {
// collectedfrom: isProvidedBy / provides pair
cf -> getRelatioPair(
value, relationList, cf,
ModelConstants.IS_PROVIDED_BY, ModelConstants.PROVIDES, hashCodes));
// hostedby: isHostedBy / hosts pair
hb -> getRelatioPair(
value, relationList, hb,
Constants.IS_HOSTED_BY, Constants.HOSTS, hashCodes));
Set<String> communities = communityMap.keySet();
.ifPresent(contexts -> contexts.forEach(context -> {
String id = context.getId();
// Only the part of the context id preceding the first ":" is matched against the community map.
if (id.contains(":")) {
id = id.substring(0, id.indexOf(":"));
if (communities.contains(id)) {
String contextId = Utils.getContextId(id);
Provenance provenance = Optional
dinfo -> Optional
paction -> Provenance
// NOTE(review): here RELATIONSHIP precedes IS_RELATED_TO, while getRelation expects (relName, relType) and
// getRelatioPair passes the relation name first — the two arguments look swapped; confirm.
Relation r = getRelation(
value.getId(), contextId,
ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, provenance);
if (!hashCodes.contains(r.hashCode())) {
r = getRelation(
contextId, value.getId(),
ModelConstants.IS_RELATED_TO, provenance);
if (!hashCodes.contains(r.hashCode())) {
return relationList.iterator();
}, Encoders.bean(Relation.class))
.option("compression", "gzip")
/**
 * Creates the two relations linking a result and a datasource (one per direction) and, judging from the visible
 * checks, adds each of them only when its hash code has not been seen yet.
 *
 * @param value the result
 * @param relationList list collecting the relations (the add statements are not visible in this view)
 * @param cf key/value describing the datasource; its key is used as the datasource identifier
 * @param result_dtasource name of the relation from the result to the datasource
 * @param datasource_result name of the relation from the datasource to the result
 * @param hashCodes hash codes of the relations already produced
 */
private static <R extends Result> void getRelatioPair(R value, List<Relation> relationList, KeyValue cf,
String result_dtasource, String datasource_result,
Set<Integer> hashCodes) {
Provenance provenance = Optional
dinfo -> Optional
paction -> Provenance
Relation r = getRelation(
cf.getKey(), Constants.RESULT_ENTITY, Constants.DATASOURCE_ENTITY,
result_dtasource, ModelConstants.PROVISION,
if (!hashCodes.contains(r.hashCode())) {
r = getRelation(
cf.getKey(), value.getId(),
datasource_result, ModelConstants.PROVISION,
if (!hashCodes.contains(r.hashCode())) {
/**
 * Builds a relation between two typed nodes.
 *
 * @param source identifier of the source node
 * @param target identifier of the target node
 * @param sourceType type of the source node
 * @param targetType type of the target node
 * @param relName name of the relation
 * @param relType type of the relation
 * @param provenance provenance of the relation (NOTE(review): no statement using it is visible in this view)
 * @return the relation
 */
private static Relation getRelation(String source, String target, String sourceType, String targetType,
String relName, String relType, Provenance provenance) {
Relation r = new Relation();
r.setSource(Node.newInstance(source, sourceType));
r.setTarget(Node.newInstance(target, targetType));
r.setReltype(RelType.newInstance(relName, relType));
return r;

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.Serializable;
/**
 * Pair of identifiers associating an organization with its representative.
 */
public class MergedRels implements Serializable {

	/** Identifier of the organization. */
	private String organizationId;

	/** Identifier of the representative associated with the organization. */
	private String representativeId;

	/** @return the identifier of the organization, or {@code null} when never set */
	public String getOrganizationId() {
		return organizationId;
	}

	/** @param organizationId the identifier of the organization */
	public void setOrganizationId(String organizationId) {
		this.organizationId = organizationId;
	}

	/** @return the identifier of the representative, or {@code null} when never set */
	public String getRepresentativeId() {
		return representativeId;
	}

	/** @param representativeId the identifier of the representative */
	public void setRepresentativeId(String representativeId) {
		this.representativeId = representativeId;
	}

}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
 * Map from a String key to a list of Strings whose {@link #get(String)} never returns {@code null}.
 */
public class OrganizationMap extends HashMap<String, List<String>> {

	/** Creates an empty map. */
	public OrganizationMap() {
		super();
	}

	/**
	 * Returns the list associated with the key, or a new empty list when the key has no (non-null) value.
	 *
	 * The empty list is not stored in the map. Note that this method overloads, and does not override,
	 * {@link HashMap#get(Object)}: calls made through a {@code Map} typed reference still return {@code null}
	 * for missing keys.
	 *
	 * @param key the key to look up
	 * @return the associated list, or a new empty list; never {@code null}
	 */
	public List<String> get(String key) {
		// single lookup instead of querying the map twice
		final List<String> values = super.get(key);
		return values != null ? values : new ArrayList<>();
	}

}

View File

@ -0,0 +1,98 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
/**
 * Processes the ContextInfo information to produce a new Context Entity or a set of Relations between the
 * generic context entity and the datasources/projects related to the context.
 */
public class Process implements Serializable {
private static final Logger log = LoggerFactory.getLogger(Process.class);
/**
 * Creates the entity describing a context: a {@code ResearchCommunity} (with its subjects) when the context type
 * is "community", a plain {@code ResearchInitiative} otherwise.
 *
 * @param ci the context information
 * @return the entity, returned through an unchecked cast to {@code R}
 * @throws RuntimeException wrapping any exception raised while building the entity
 */
public static <R extends ResearchInitiative> R getEntity(ContextInfo ci) {
try {
ResearchInitiative ri;
// NOTE(review): a null type makes equals() throw NullPointerException, which is rethrown as RuntimeException.
if (ci.getType().equals("community")) {
ri = new ResearchCommunity();
((ResearchCommunity) ri).setSubject(ci.getSubject());
} else {
ri = new ResearchInitiative();
// NOTE(review): when the Zenodo community is null the concatenation yields the prefix followed by "null".
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
return (R) ri;
} catch (final Exception e) {
throw new RuntimeException(e);
/**
 * Creates the relations between a context and the entities listed for it, one per direction.
 * NOTE(review): the statements building and adding the relations are truncated in this view.
 *
 * @param ci the context information
 * @return the list of relations
 * @throws RuntimeException wrapping any exception raised while building the relations
 */
public static List<Relation> getRelation(ContextInfo ci) {
try {
List<Relation> relationList = new ArrayList<>();
.forEach(ds -> {
// The first two characters of the identifier select the entity type of the node.
String nodeType = ModelSupport.idPrefixEntity.get(ds.substring(0, 2));
String contextId = Utils.getContextId(ci.getId());
// context -> entity
contextId, eu.dnetlib.dhp.schema.dump.oaf.graph.Constants.CONTEXT_ENTITY),
Node.newInstance(ds, nodeType),
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
// entity -> context
Node.newInstance(ds, nodeType),
contextId, eu.dnetlib.dhp.schema.dump.oaf.graph.Constants.CONTEXT_ENTITY),
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
return relationList;
} catch (final Exception e) {
throw new RuntimeException(e);

View File

@ -0,0 +1,132 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.StringReader;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * Queries the Information System for the context profiles and turns the answers into {@link ContextInfo}
 * instances handed to a consumer.
 * NOTE(review): several statements of this class are truncated in this view; comments only describe what is visible.
 */
public class QueryInformationSystem {
private ISLookUpService isLookUp;
// Profiles returned by XQUERY; assigned by execContextRelationQuery() or by the setter.
private List<String> contextRelationResult;
// Selects the profiles of the contexts of type community or ri whose status parameter is 'all'.
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
" and $x//context/param[./@name = 'status']/text() = 'all' " +
" return " +
// Same selection, returning id, name, description, subject, zenodoCommunity and type joined by '@@'.
private static final String XQUERY_ENTITY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
"where $x//context[./@type='community' or ./@type = 'ri'] and $x//context/param[./@name = 'status']/text() = 'all' return "
"concat(data($x//context/@id) , '@@', $x//context/param[./@name =\"name\"]/text(), '@@', " +
"$x//context/param[./@name=\"description\"]/text(), '@@', $x//context/param[./@name = \"subject\"]/text(), '@@', "
"$x//context/param[./@name = \"zenodoCommunity\"]/text(), '@@', $x//context/@type)";
/**
 * Builds a {@link ContextInfo} for each answer, split on "@@", and passes it to the consumer.
 *
 * @param consumer receives the context information
 * @throws ISLookUpException declared by the signature
 */
public void getContextInformation(final Consumer<ContextInfo> consumer) throws ISLookUpException {
.forEach(c -> {
ContextInfo cinfo = new ContextInfo();
String[] cSplit = c.split("@@");
// NOTE(review): String.split drops trailing empty strings, so an answer ending with empty fields yields a
// shorter array and the index access can throw ArrayIndexOutOfBoundsException — confirm the inputs.
if (!cSplit[3].trim().equals("")) {
public List<String> getContextRelationResult() {
return contextRelationResult;
public void setContextRelationResult(List<String> contextRelationResult) {
this.contextRelationResult = contextRelationResult;
public ISLookUpService getIsLookUp() {
return isLookUp;
public void setIsLookUp(ISLookUpService isLookUpService) {
this.isLookUp = isLookUpService;
/**
 * Runs XQUERY through the lookup service and stores the answers in {@code contextRelationResult}.
 *
 * @throws ISLookUpException declared by the signature
 */
public void execContextRelationQuery() throws ISLookUpException {
contextRelationResult = isLookUp.quickSearchProfile(XQUERY);
/**
 * Parses every stored profile and, for the child elements named "category" whose id ends with the given
 * category, collects the identifiers listed in it.
 *
 * NOTE(review): {@code contextRelationResult} is null until execContextRelationQuery() or the setter has been
 * called, in which case forEach throws NullPointerException.
 *
 * @param consumer receives the context information (the call handing it over is not visible in this view)
 * @param category category to select, compared with the part of the category id following the last "::"
 * @param prefix prefix prepended to the collected identifiers
 */
public void getContextRelation(final Consumer<ContextInfo> consumer, String category, String prefix) {
contextRelationResult.forEach(xml -> {
ContextInfo cinfo = new ContextInfo();
final Document doc;
try {
doc = new SAXReader().read(new StringReader(xml));
Element root = doc.getRootElement();
Iterator it = root.elementIterator();
while (it.hasNext()) {
Element el = (Element) it.next();
if (el.getName().equals("category")) {
// NOTE(review): a category without an id attribute makes the substring call below throw
// NullPointerException; when "::" is absent lastIndexOf returns -1 and the first character is dropped.
String categoryId = el.attributeValue("id");
categoryId = categoryId.substring(categoryId.lastIndexOf("::") + 2);
if (categoryId.equals(category)) {
cinfo.setDatasourceList(getCategoryList(el, prefix));
} catch (DocumentException e) {
/**
 * Collects the text of the {@code param} descendants named "openaireId", each prepended with the prefix and "|".
 *
 * @param el the element to search
 * @param prefix prefix prepended to every identifier
 * @return the prefixed identifiers, possibly empty
 */
private List<String> getCategoryList(Element el, String prefix) {
List<String> datasourceList = new ArrayList<>();
for (Object node : el.selectNodes(".//param")) {
Node n = (Node) node;
if (n.valueOf("./@name").equals("openaireId")) {
datasourceList.add(prefix + "|" + n.getText());
return datasourceList;

View File

@ -0,0 +1,89 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
* Reads all the entities of the same type (Relation / Results) and saves them in the same folder
public class SparkCollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkCollectAndSave.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath + "/result");
run(spark, inputPath, outputPath);
private static void run(SparkSession spark, String inputPath, String outputPath) {
.readPath(spark, inputPath + "/result/publication", Result.class)
.union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class))
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class))
.union(Utils.readPath(spark, inputPath + "/result/software", Result.class))
.option("compression", "gzip")
.json(outputPath + "/result");
.readPath(spark, inputPath + "/relation/publication", Relation.class)
.union(Utils.readPath(spark, inputPath + "/relation/dataset", Relation.class))
.union(Utils.readPath(spark, inputPath + "/relation/orp", Relation.class))
.union(Utils.readPath(spark, inputPath + "/relation/software", Relation.class))
.union(Utils.readPath(spark, inputPath + "/relation/contextOrg", Relation.class))
.union(Utils.readPath(spark, inputPath + "/relation/context", Relation.class))
.union(Utils.readPath(spark, inputPath + "/relation/relation", Relation.class))
.option("compression", "gzip")
.json(outputPath + "/relation");

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
* Spark Job that fires the dump for the entities
public class SparkDumpEntitiesJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpEntitiesJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String communityMapPath = parser.get("communityMapPath");
Class<? extends OafEntity> inputClazz = (Class<? extends OafEntity>) Class.forName(resultClassName);
DumpGraphEntities dg = new DumpGraphEntities();
dg.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath);

View File

@ -0,0 +1,111 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Node;
import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
import eu.dnetlib.dhp.schema.oaf.Relation;
* Dumps eu.dnetlib.dhp.schema.oaf.Relation in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation
public class SparkDumpRelationJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpRelationJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath);
dumpRelation(spark, inputPath, outputPath);
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) {
.readPath(spark, inputPath, Relation.class)
.map(relation -> {
eu.dnetlib.dhp.schema.dump.oaf.graph.Relation rel = new eu.dnetlib.dhp.schema.dump.oaf.graph.Relation();
ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2))));
ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2))));
datainfo -> rel
.newInstance(datainfo.getProvenanceaction().getClassname(), datainfo.getTrust())));
return rel;
}, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
.option("compression", "gzip")

View File

@ -0,0 +1,57 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.oaf.Result;
* Spark job that fires the extraction of relations from entities
public class SparkExtractRelationFromEntities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkExtractRelationFromEntities.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String communityMapPath = parser.get("communityMapPath");
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
Extractor extractor = new Extractor();
extractor.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMapPath);

View File

@ -0,0 +1,161 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.function.Consumer;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Node;
import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
import eu.dnetlib.dhp.schema.oaf.Relation;
* Create new Relations between Context Entities and Organizations whose products are associated to the context.
* It produces relations such as: organization <-> isRelatedTo <-> context
public class SparkOrganizationRelation implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkOrganizationRelation.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
Boolean isSparkSessionManaged = Optional
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final OrganizationMap organizationMap = new Gson()
.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
log.info("organization map : {}", new Gson().toJson(organizationMap));
SparkConf conf = new SparkConf();
spark -> {
Utils.removeOutputDir(spark, outputPath);
extractRelation(spark, inputPath, organizationMap, outputPath);
private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap,
String outputPath) {
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList = new ArrayList<>();
Dataset<MergedRels> mergedRelsDataset = spark
"SELECT target organizationId, source representativeId " +
"FROM relation " +
"WHERE datainfo.deletedbyinference = false " +
"AND relclass = 'merges' " +
"AND substr(source, 1, 2) = '20'")
mergedRelsDataset.map((MapFunction<MergedRels, MergedRels>) mergedRels -> {
if (organizationMap.containsKey(mergedRels.getOrganizationId())) {
return mergedRels;
return null;
}, Encoders.bean(MergedRels.class))
.forEach(getMergedRelsConsumer(organizationMap, relList));
oId -> organizationMap
.forEach(community -> addRelations(relList, community, oId)));
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
.option("compression", "gzip")
private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap,
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList) {
return mergedRels -> {
String oId = mergedRels.getOrganizationId();
.forEach(community -> addRelations(relList, community, mergedRels.getRepresentativeId()));
private static void addRelations(List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList, String community,
String organization) {
String id = Utils.getContextId(community);
log.info("create relation for organization: {}", organization);
Node.newInstance(id, Constants.CONTEXT_ENTITY),
Node.newInstance(organization, ModelSupport.idPrefixEntity.get(organization.substring(0, 2))),
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
Node.newInstance(organization, ModelSupport.idPrefixEntity.get(organization.substring(0, 2))),
Node.newInstance(id, Constants.CONTEXT_ENTITY),
RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),

View File

@ -0,0 +1,25 @@
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
"paramDescription": "the name node",
"paramRequired": true
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true

View File

@ -0,0 +1,24 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
"paramName": "hdp",
"paramLongName": "hdfsPath",
"paramDescription": "the path used to store the output archive",
"paramRequired": true
"paramDescription": "the name node",
"paramRequired": true

View File

@ -0,0 +1,36 @@
"paramDescription": "the path to the serialization of the community map",
"paramRequired": true
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true

View File

@ -0,0 +1,30 @@

View File

@ -0,0 +1,431 @@
<workflow-app name="dump_community_products" xmlns="uri:oozie:workflow:0.5">
<description>the source path</description>
<description>the isLookup service endpoint</description>
<description>the output path</description>
<description>the access token used for the deposition in Zenodo</description>
<description>the connection url for Zenodo</description>
<description> the metadata associated to the deposition</description>
<description>true if it is a brand new deposition, false for a new version of an old deposition</description>
<description>for new version, the id of the record for the old deposition</description>
<description>the target hive database name</description>
<description>hive server jdbc url</description>
<description>hive server metastore URIs</description>
<description>memory for driver process</description>
<description>memory for individual executor</description>
<description>number of cores used by single executor</description>
<description>oozie action sharelib for spark 2.*</description>
<description>spark 2.* extra listeners classname</description>
<description>spark 2.* sql query execution listeners classname</description>
<description>spark 2.* yarn history server address</description>
<description>spark 2.* event log dir location</description>
<start to="reset_outputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
<action name="reset_outputpath">
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
<ok to="save_community_map"/>
<error to="Kill"/>
<action name="save_community_map">
<ok to="fork_dump"/>
<error to="Kill"/>
<fork name="fork_dump">
<path start="dump_publication"/>
<path start="dump_dataset"/>
<path start="dump_orp"/>
<path start="dump_software"/>
<action name="dump_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Dump table publication for community related products</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_dump"/>
<error to="Kill"/>
<action name="dump_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Dump table dataset for community related products</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_dump"/>
<error to="Kill"/>
<action name="dump_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Dump table ORP for community related products</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_dump"/>
<error to="Kill"/>
<action name="dump_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Dump table software for community related products</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_dump"/>
<error to="Kill"/>
<join name="join_dump" to="prepareResultProject"/>
<action name="prepareResultProject">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Prepare association result subset of project info</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="fork_extendWithProject"/>
<error to="Kill"/>
<fork name="fork_extendWithProject">
<path start="extend_publication"/>
<path start="extend_dataset"/>
<path start="extend_orp"/>
<path start="extend_software"/>
<action name="extend_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Extend dumped publications with information about project</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_extend"/>
<error to="Kill"/>
<action name="extend_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Extend dumped dataset with information about project</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_extend"/>
<error to="Kill"/>
<action name="extend_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Extend dumped ORP with information about project</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_extend"/>
<error to="Kill"/>
<action name="extend_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Extend dumped software with information about project</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="join_extend"/>
<error to="Kill"/>
<join name="join_extend" to="splitForCommunities"/>
<action name="splitForCommunities">
<spark xmlns="uri:oozie:spark-action:0.2">
<name>Split dumped result for community</name>
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
<ok to="make_archive"/>
<error to="Kill"/>
<action name="make_archive">
<ok to="send_zenodo"/>
<error to="Kill"/>
<action name="send_zenodo">
<ok to="End"/>
<error to="Kill"/>
<end name="End"/>

View File

@ -0,0 +1,29 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
"paramName": "pip",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path of the association result projectlist",
"paramRequired": true

View File

@ -0,0 +1,20 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false

View File

@ -0,0 +1,542 @@
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"AccessRight": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/"
"label": {
"type": "string",
"description": "Label for the access mode"
"scheme": {
"type": "string",
"description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/"
"ControlledField": {
"type": "object",
"properties": {
"scheme": {
"type": "string",
"description": "The scheme for the resource"
"value": {
"type": "string",
"description": "the value in the scheme"
"KeyValue": {
"type": "object",
"properties": {
"key": {
"type": "string",
"description": "Description of key"
"value": {
"type": "string",
"description": "Description of value"
"Provenance": {
"type": "object",
"properties": {
"provenance": {
"type": "string",
"description": "The provenance of the information"
"trust": {
"type": "string",
"description": "The trust associated to the information"
"type": "object",
"properties": {
"author": {
"description": "List of authors of the research results",
"type": "array",
"items": {
"type": "object",
"properties": {
"affiliation": {
"description": "Affiliations of the author",
"type": "array",
"items": {
"type": "string",
"description": "One of the affiliation of the author"
"fullname": {
"type": "string",
"description": "Fullname of the author"
"name": {
"type": "string",
"description": "First name of the author"
"pid": {
"type": "object",
"properties": {
"id": {
"allOf": [
{"$ref": "#/definitions/ControlledField"},
{"description": "The author's id and scheme. OpenAIRE currently supports 'ORCID'"}
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "The provenance of the author's pid"}
"description": "Persistent identifier of the author (e.g. ORCID)"
"rank": {
"type": "integer",
"description": "Order in which the author appears in the authors list"
"surname": {
"type": "string",
"description": "Surname of the author"
"description": "One of the author of the research result"
"bestaccessright": {
"allOf": [
{"$ref": "#/definitions/AccessRight"},
{"description": "The openest access right associated to the manifestations of this research results"}
"codeRepositoryUrl": {
"type": "string",
"description": "Only for results with type 'software': the URL to the repository with the source code"
"collectedfrom": {
"description": "Information about the sources from which the record has been collected",
"type": "array",
"items": {
"allOf": [
{"$ref": "#/definitions/KeyValue"},
{"description": "Key is the OpenAIRE identifier of the data source, value is its name"}
"contactgroup": {
"description": "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource",
"type": "array",
"items": {
"type": "string"
"contactperson": {
"description": "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource",
"type": "array",
"items": {
"type": "string"
"container": {
"type": "object",
"properties": {
"conferencedate": {
"type": "string",
"description": "Date of the conference"
"conferenceplace": {
"type": "string",
"description": "Place of the conference"
"edition": {
"type": "string",
"description": "Edition of the journal or conference proceeding"
"ep": {
"type": "string",
"description": "End page"
"iss": {
"type": "string",
"description": "Journal issue"
"issnLinking": {
"type": "string",
"description": "Journal linking iisn"
"issnOnline": {
"type": "string",
"description": "Journal online issn"
"issnPrinted": {
"type": "string",
"description": "Journal printed issn"
"name": {
"type": "string",
"description": "Name of the journal or conference"
"sp": {
"type": "string",
"description": "Start page"
"vol": {
"type": "string",
"description": "Volume"
"description": "Container has information about the conference or journal where the result has been presented or published"
"context": {
"description": "Reference to a relevant research infrastructure, initiative or community (RI/RC) among those collaborating with OpenAIRE. Please see https://connect.openaire.eu",
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Code identifying the RI/RC"
"label": {
"type": "string",
"description": "Label of the RI/RC"
"provenance": {
"description": "Why this result is associated to the RI/RC.",
"type": "array",
"items": {
"allOf": [
{"$ref": "#/definitions/Provenance"}
"contributor": {
"description": "Contributors of this result",
"type": "array",
"items": {
"type": "string"
"country": {
"description": "Country associated to this result",
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "ISO 3166-1 alpha-2 country code"
"label": {
"type": "string",
"description": "English label of the country"
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this result is associated to the country."}
"coverage": {
"type": "array",
"items": {
"type": "string"
"dateofcollection": {
"type": "string",
"description": "When OpenAIRE collected the record the last time"
"description": {
"type": "array",
"items": {
"type": "string"
"documentationUrl": {
"description": "Only for results with type 'software': URL to the software documentation",
"type": "array",
"items": {
"type": "string"
"embargoenddate": {
"type": "string",
"description": "Date when the embargo ends and this result turns Open Access"
"externalReference": {
"description": "Links to external resources like entries from thematic databases (e.g. Protein Data Bank)",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this result is linked to the external resource"}
"typology": {
"type": "string"
"value": {
"type": "string"
"format": {
"type": "array",
"items": {
"type": "string"
"geolocation": {
"description": "Geolocation information",
"type": "array",
"items": {
"type": "object",
"properties": {
"box": {
"type": "string"
"place": {
"type": "string"
"point": {
"type": "string"
"id": {
"type": "string",
"description": "OpenAIRE identifier"
"instance": {
"description": "Manifestations (i.e. different versions) of the result. For example: the pre-print and the published versions are two manifestations of the same research result",
"type": "array",
"items": {
"type": "object",
"properties": {
"accessright": {
"allOf": [
{"$ref": "#/definitions/AccessRight"},
{"description": "Access right of this instance"}
"collectedfrom": {
"allOf": [
{"$ref": "#/definitions/KeyValue"},
{"description": "Information about the source from which the instance has been collected. Key is the OpenAIRE identifier of the data source, value is its name"}
"hostedby": {
"allOf": [
{"$ref": "#/definitions/KeyValue"},
{"description": "Information about the source from which the instance can be viewed or downloaded. Key is the OpenAIRE identifier of the data source, value is its name"}
"license": {
"type": "string",
"description": "License applied to the instance"
"publicationdate": {
"type": "string",
"description": "Publication date of the instance"
"refereed": {
"type": "string",
"description": "Was the instance subject to peer-review? Possible values are 'Unknown', 'nonPeerReviewed', 'peerReviewed' (see also https://api.openaire.eu/vocabularies/dnet:review_levels)"
"type": {
"type": "string",
"description": "Type of the instance. Possible values are listed at https://api.openaire.eu/vocabularies/dnet:publication_resource"
"url": {
"description":"Location where the instance is accessible",
"type": "array",
"items": {
"type": "string"
"language": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "alpha-3/ISO 639-2 code of the language"
"label": {
"type": "string",
"description": "English label"
"lastupdatetimestamp": {
"type": "integer",
"description": "Timestamp of last update of the record in OpenAIRE"
"maintitle": {
"type": "string",
"description": "Title"
"originalId": {
"description": "Identifiers of the record at the original sources",
"type": "array",
"items": {
"type": "string"
"pid": {
"description": "Persistent identifiers of the result",
"type": "array",
"items": {
"allOf": [
{"$ref": "#/definitions/ControlledField"},
{"description": "scheme: list of available schemes are at https://api.openaire.eu/vocabularies/dnet:pid_types, value: the PID of the result "}
"programmingLanguage": {
"type": "string",
"description": "Only for results with type 'software': the programming language"
"projects": {
"description": "List of projects (i.e. grants) that (co-)funded the production ofn the research results",
"type": "array",
"items": {
"type": "object",
"properties": {
"acronym": {
"type": "string",
"description": "Project acronym"
"code": {
"type": "string",
"description": "Grant code"
"funder": {
"type": "object",
"properties": {
"fundingStream": {
"type": "string",
"description": "Stream of funding (e.g. for European Commission can be H2020 or FP7)"
"jurisdiction": {
"type": "string",
"description": "Geographical jurisdiction (e.g. for European Commission is EU, for Croatian Science Foundation is HR)"
"name": {
"type": "string",
"description": "Name of the funder"
"shortName": {
"type": "string",
"description": "Short name or acronym of the funder"
"description": "Information about the funder funding the project"
"id": {
"type": "string",
"description": "OpenAIRE identifier of the project"
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this project is associated to the result"}
"title": {
"type": "string",
"description": "Title of the project"
"publicationdate": {
"type": "string",
"description": "Date of publication"
"publisher": {
"type": "string",
"description": "Publisher"
"size": {
"type": "string",
"description": "Only for results with type 'dataset': the declared size of the dataset"
"source": {
"description": "See definition of Dublin Core field dc:source",
"type": "array",
"items": {
"type": "string"
"subjects": {
"description": "Keywords associated to the result",
"type": "array",
"items": {
"type": "object",
"properties": {
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this subject is associated to the result"}
"subject": {
"allOf": [
{"$ref": "#/definitions/ControlledField"},
{"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary). "}
"subtitle": {
"type": "string",
"description": "Sub-title of the result"
"tool": {
"description": "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product",
"type": "array",
"items": {
"type": "string"
"type": {
"type": "string",
"description": "Type of the result: one of 'publication', 'dataset', 'software', 'other' (see also https://api.openaire.eu/vocabularies/dnet:result_typologies)"
"version": {
"type": "string",
"description": "Version of the result"

View File

@ -0,0 +1,32 @@
"paramDescription": "the path to the serialization of the community map",
"paramRequired": false
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false

View File

@ -0,0 +1,51 @@
"paramDescription": "if it is a new deposition (true) or a new versione (false)",
"paramRequired": true
"paramDescription": "The id of the concept record for a new version",
"paramRequired": false
"paramDescription": "the path to the serialization of the community map",
"paramRequired": false
"paramDescription": "the path of the folder tofind files to send to Zenodo",
"paramRequired": true
"paramName": "hdfsnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the name node",
"paramRequired": true
"paramName": "at",
"paramLongName": "accessToken",
"paramDescription": "the access token for the deposition",
"paramRequired": false
"paramDescription": "the url to connect to deposit",
"paramRequired": false
"paramDescription": "metadata associated to the deposition",
"paramRequired": false

View File

@ -0,0 +1,24 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false

View File

@ -0,0 +1,24 @@
"paramDescription": "URL of the isLookUp Service",
"paramRequired": false
"paramName": "hdfs",
"paramLongName": "hdfsPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
"paramName": "nn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the name node",
"paramRequired": true

Some files were not shown because too many files have changed in this diff Show More