1
0
Fork 0

Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

This commit is contained in:
Enrico Ottonello 2020-11-10 11:49:43 +01:00
commit fea2451658
112 changed files with 2167 additions and 1865 deletions

View File

@ -0,0 +1,117 @@
package eu.dnetlib.dhp.common;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.hadoop.fs.*;
public class MakeTarArchive implements Serializable {
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
return new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
while (fileStatusListIterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, 0);
}
ar.close();
}
public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name,
int gBperSplit) throws IOException {
final long bytesPerSplit = 1024L * 1024L * 1024L * gBperSplit;
long sourceSize = fileSystem.getContentSummary(new Path(inputPath)).getSpaceConsumed();
if (sourceSize < bytesPerSplit) {
write(fileSystem, inputPath, outputPath + ".tar", dir_name);
} else {
int partNum = 0;
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
boolean next = fileStatusListIterator.hasNext();
while (next) {
TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar");
long current_size = 0;
while (next && current_size < bytesPerSplit) {
current_size = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, current_size);
next = fileStatusListIterator.hasNext();
}
partNum += 1;
ar.close();
}
}
}
private static long writeCurrentFile(FileSystem fileSystem, String dir_name,
RemoteIterator<LocatedFileStatus> fileStatusListIterator,
TarArchiveOutputStream ar, long current_size) throws IOException {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
name = "communities_infrastructures.json";
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen();
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte data[] = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}
bis.close();
ar.closeArchiveEntry();
}
return current_size;
}
}

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.common.api;
import java.io.*; import java.io.*;
import java.io.IOException; import java.io.IOException;
import java.util.concurrent.TimeUnit;
import com.google.gson.Gson; import com.google.gson.Gson;
@ -50,14 +51,15 @@ public class ZenodoAPIClient implements Serializable {
/** /**
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload * Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
*
* @return response code * @return response code
* @throws IOException * @throws IOException
*/ */
public int newDeposition() throws IOException { public int newDeposition() throws IOException {
String json = "{}"; String json = "{}";
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, json); RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString) .url(urlString)
@ -86,13 +88,18 @@ public class ZenodoAPIClient implements Serializable {
/** /**
* Upload files in Zenodo. * Upload files in Zenodo.
*
* @param is the inputStream for the file to upload * @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo * @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file * @param len the size of the file
* @return the response code * @return the response code
*/ */
public int uploadIS(InputStream is, String file_name, long len) throws IOException { public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder()
.writeTimeout(600, TimeUnit.SECONDS)
.readTimeout(600, TimeUnit.SECONDS)
.connectTimeout(600, TimeUnit.SECONDS)
.build();
Request request = new Request.Builder() Request request = new Request.Builder()
.url(bucket + "/" + file_name) .url(bucket + "/" + file_name)
@ -110,15 +117,16 @@ public class ZenodoAPIClient implements Serializable {
/** /**
* Associates metadata information to the current deposition * Associates metadata information to the current deposition
*
* @param metadata the metadata * @param metadata the metadata
* @return response code * @return response code
* @throws IOException * @throws IOException
*/ */
public int sendMretadata(String metadata) throws IOException { public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, metadata); RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON);
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString + "/" + deposition_id) .url(urlString + "/" + deposition_id)
@ -140,6 +148,7 @@ public class ZenodoAPIClient implements Serializable {
/** /**
* To publish the current deposition. It works for both new deposition or new version of an old deposition * To publish the current deposition. It works for both new deposition or new version of an old deposition
*
* @return response code * @return response code
* @throws IOException * @throws IOException
*/ */
@ -147,12 +156,14 @@ public class ZenodoAPIClient implements Serializable {
String json = "{}"; String json = "{}";
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish") .url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.post(RequestBody.create(MEDIA_TYPE_JSON, json)) .post(body)
.build(); .build();
try (Response response = httpClient.newCall(request).execute()) { try (Response response = httpClient.newCall(request).execute()) {
@ -166,11 +177,12 @@ public class ZenodoAPIClient implements Serializable {
} }
/** /**
* To create a new version of an already published deposition. * To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* It sets the deposition_id and the bucket to be used for the new version. * for the new version.
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is *
* the last part of the url for the DOI Zenodo suggests to use to cite all versions: * @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
* DOI: 10.xxx/zenodo.656930 concept_rec_id = 656930 * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code * @return response code
* @throws IOException * @throws IOException
* @throws MissingConceptDoiException * @throws MissingConceptDoiException
@ -179,12 +191,14 @@ public class ZenodoAPIClient implements Serializable {
setDepositionId(concept_rec_id); setDepositionId(concept_rec_id);
String json = "{}"; String json = "{}";
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion") .url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader("Authorization", "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.post(RequestBody.create(MEDIA_TYPE_JSON, json)) .post(body)
.build(); .build();
try (Response response = httpClient.newCall(request).execute()) { try (Response response = httpClient.newCall(request).execute()) {
@ -201,6 +215,41 @@ public class ZenodoAPIClient implements Serializable {
} }
} }
/**
* To finish uploading a version or new deposition not published
* It sets the deposition_id and the bucket to be used
*
*
* @param deposition_id the deposition id of the not yet published upload
* concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Authorization", "Bearer " + access_token)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return response.code();
}
}
private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException { private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class); ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class);
@ -217,7 +266,7 @@ public class ZenodoAPIClient implements Serializable {
} }
private String getPrevDepositions() throws IOException { private String getPrevDepositions() throws IOException {
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString) .url(urlString)
@ -238,7 +287,9 @@ public class ZenodoAPIClient implements Serializable {
} }
private String getBucket(String url) throws IOException { private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient(); OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(600, TimeUnit.SECONDS)
.build();
Request request = new Request.Builder() Request request = new Request.Builder()
.url(url) .url(url)

View File

@ -19,6 +19,30 @@ public class ZenodoAPIClientTest {
private final String CONCEPT_REC_ID = "657113"; private final String CONCEPT_REC_ID = "657113";
private final String depositionId = "674915";
@Test
public void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMretadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test @Test
public void testNewDeposition() throws IOException { public void testNewDeposition() throws IOException {

View File

@ -39,15 +39,15 @@ public class ModelConstants {
public static final String IS_SUPPLEMENT_TO = "isSupplementTo"; public static final String IS_SUPPLEMENT_TO = "isSupplementTo";
public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy"; public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy";
public static final String PART = "part"; public static final String PART = "part";
public static final String IS_PART_OF = "IsPartOf"; public static final String IS_PART_OF = "isPartOf";
public static final String HAS_PARTS = "HasParts"; public static final String HAS_PARTS = "hasParts";
public static final String RELATIONSHIP = "relationship"; public static final String RELATIONSHIP = "relationship";
public static final String CITATION = "citation"; public static final String CITATION = "citation";
public static final String CITES = "cites"; public static final String CITES = "cites";
public static final String IS_CITED_BY = "IsCitedBy"; public static final String IS_CITED_BY = "isCitedBy";
public static final String REVIEW = "review"; public static final String REVIEW = "review";
public static final String REVIEWS = "reviews"; public static final String REVIEWS = "reviews";
public static final String IS_REVIEWED_BY = "IsReviewedBy"; public static final String IS_REVIEWED_BY = "isReviewedBy";
public static final String RESULT_PROJECT = "resultProject"; public static final String RESULT_PROJECT = "resultProject";
public static final String OUTCOME = "outcome"; public static final String OUTCOME = "outcome";

View File

@ -10,17 +10,11 @@ import java.util.List;
* String to store the license applied to the instance. It corresponds to the value of the licence in the instance to be * String to store the license applied to the instance. It corresponds to the value of the licence in the instance to be
* dumped - accessright of type eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store the accessright of the instance. - * dumped - accessright of type eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store the accessright of the instance. -
* type of type String to store the type of the instance as defined in the corresponding dnet vocabulary * type of type String to store the type of the instance as defined in the corresponding dnet vocabulary
* (dnet:pubication_resource). It corresponds to the instancetype.classname of the instance to be mapped - hostedby of * (dnet:pubication_resource). It corresponds to the instancetype.classname of the instance to be mapped - url of type
* type eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance can be * List<String> list of locations where the instance is accessible. It corresponds to url of the instance to be dumped -
* viewed or downloaded. It is mapped against the hostedby parameter of the instance to be dumped and - key corresponds * publicationdate of type String to store the publication date of the instance ;// dateofacceptance; - refereed of type
* to hostedby.key - value corresponds to hostedby.value - url of type List<String> list of locations where the instance * String to store information abour tthe review status of the instance. Possible values are 'Unknown',
* is accessible. It corresponds to url of the instance to be dumped - collectedfrom of type * 'nonPeerReviewed', 'peerReviewed'. It corresponds to refereed.classname of the instance to be dumped
* eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance has been
* collected. It is mapped against the collectedfrom parameter of the instance to be dumped and - key corresponds to
* collectedfrom.key - value corresponds to collectedfrom.value - publicationdate of type String to store the
* publication date of the instance ;// dateofacceptance; - refereed of type String to store information abour tthe
* review status of the instance. Possible values are 'Unknown', 'nonPeerReviewed', 'peerReviewed'. It corresponds to
* refereed.classname of the instance to be dumped
*/ */
public class Instance implements Serializable { public class Instance implements Serializable {
@ -30,12 +24,8 @@ public class Instance implements Serializable {
private String type; private String type;
private KeyValue hostedby;
private List<String> url; private List<String> url;
private KeyValue collectedfrom;
private String publicationdate;// dateofacceptance; private String publicationdate;// dateofacceptance;
private String refereed; // peer-review status private String refereed; // peer-review status
@ -64,14 +54,6 @@ public class Instance implements Serializable {
this.type = type; this.type = type;
} }
public KeyValue getHostedby() {
return hostedby;
}
public void setHostedby(KeyValue hostedby) {
this.hostedby = hostedby;
}
public List<String> getUrl() { public List<String> getUrl() {
return url; return url;
} }
@ -80,14 +62,6 @@ public class Instance implements Serializable {
this.url = url; this.url = url;
} }
public KeyValue getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(KeyValue collectedfrom) {
this.collectedfrom = collectedfrom;
}
public String getPublicationdate() { public String getPublicationdate() {
return publicationdate; return publicationdate;
} }

View File

@ -3,6 +3,12 @@ package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable; import java.io.Serializable;
/**
* This class to store the common information about the project that will be dumped for community and for the whole
* graph - private String id to store the id of the project (OpenAIRE id) - private String code to store the grant
* agreement of the project - private String acronym to store the acronym of the project - private String title to store
* the tile of the project
*/
public class Project implements Serializable { public class Project implements Serializable {
protected String id;// OpenAIRE id protected String id;// OpenAIRE id
protected String code; protected String code;

View File

@ -34,34 +34,32 @@ import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
* to the list of coverage.value in the result represented in the internal model - bestaccessright of type * to the list of coverage.value in the result represented in the internal model - bestaccessright of type
* eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store informatin about the openest access right associated to the * eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store informatin about the openest access right associated to the
* manifestations of this research results. It corresponds to the same parameter in the result represented in the * manifestations of this research results. It corresponds to the same parameter in the result represented in the
* internal model - instance of type List<eu.dnetlib.dhp.schema.dump.oaf.Instance> to store all the instances associated * internal model - container of type eu.dnetlib.dhp.schema/dump.oaf.Container (only for result of type publication). It
* to the result. It corresponds to the same parameter in the result represented in the internal model - container of * corresponds to the parameter journal of the result represented in the internal model - documentationUrl of type
* type eu.dnetlib.dhp.schema/dump.oaf.Container (only for result of type publication). It corresponds to the parameter * List<String> (only for results of type software) to store the URLs to the software documentation. It corresponds to
* journal of the result represented in the internal model - documentationUrl of type List<String> (only for results of * the list of documentationUrl.value of the result represented in the internal model - codeRepositoryUrl of type String
* type software) to store the URLs to the software documentation. It corresponds to the list of documentationUrl.value * (only for results of type software) to store the URL to the repository with the source code. It corresponds to
* of the result represented in the internal model - codeRepositoryUrl of type String (only for results of type * codeRepositoryUrl.value of the result represented in the internal model - programmingLanguage of type String (only
* software) to store the URL to the repository with the source code. It corresponds to codeRepositoryUrl.value of the * for results of type software) to store the programming language. It corresponds to programmingLanguaga.classid of the
* result represented in the internal model - programmingLanguage of type String (only for results of type software) to * result represented in the internal model - contactperson of type List<String> (only for results of type other) to
* store the programming language. It corresponds to programmingLanguaga.classid of the result represented in the * store the contact person for this result. It corresponds to the list of contactperson.value of the result represented
* internal model - contactperson of type List<String> (only for results of type other) to store the contact person for * in the internal model - contactgroup of type List<String> (only for results of type other) to store the information
* this result. It corresponds to the list of contactperson.value of the result represented in the internal model - * for the contact group. It corresponds to the list of contactgroup.value of the result represented in the internal
* contactgroup of type List<String> (only for results of type other) to store the information for the contact group. It * model - tool of type List<String> (only fro results of type other) to store information about tool useful for the
* corresponds to the list of contactgroup.value of the result represented in the internal model - tool of type * interpretation and/or re-used of the research product. It corresponds to the list of tool.value in the result
* List<String> (only fro results of type other) to store information about tool useful for the interpretation and/or * represented in the internal modelt - size of type String (only for results of type dataset) to store the size of the
* re-used of the research product. It corresponds to the list of tool.value in the result represented in the internal * dataset. It corresponds to size.value in the result represented in the internal model - version of type String (only
* modelt - size of type String (only for results of type dataset) to store the size of the dataset. It corresponds to * for results of type dataset) to store the version. It corresponds to version.value of the result represented in the
* size.value in the result represented in the internal model - version of type String (only for results of type * internal model - geolocation fo type List<eu.dnetlib.dhp.schema.dump.oaf.GeoLocation> (only for results of type
* dataset) to store the version. It corresponds to version.value of the result represented in the internal model - * dataset) to store geolocation information. For each geolocation element in the result represented in the internal
* geolocation fo type List<eu.dnetlib.dhp.schema.dump.oaf.GeoLocation> (only for results of type dataset) to store * model a GeoLocation in the external model il produced - id of type String to store the OpenAIRE id of the result. It
* geolocation information. For each geolocation element in the result represented in the internal model a GeoLocation * corresponds to the id of the result represented in the internal model - originalId of type List<String> to store the
* in the external model il produced - id of type String to store the OpenAIRE id of the result. It corresponds to the * original ids of the result. It corresponds to the originalId of the result represented in the internal model - pid of
* id of the result represented in the internal model - originalId of type List<String> to store the original ids of the * type List<eu.dnetlib.dhp.schema.dump.oaf.ControlledField> to store the persistent identifiers for the result. For
* result. It corresponds to the originalId of the result represented in the internal model - pid of type * each pid in the results represented in the internal model one pid in the external model is produced. The value
* List<eu.dnetlib.dhp.schema.dump.oaf.ControlledField> to store the persistent identifiers for the result. For each pid * correspondence is: - scheme corresponds to pid.qualifier.classid of the result represented in the internal model -
* in the results represented in the internal model one pid in the external model is produced. The value correspondence * value corresponds to the pid.value of the result represented in the internal model - dateofcollection of type String
* is: - scheme corresponds to pid.qualifier.classid of the result represented in the internal model - value corresponds * to store information about the time OpenAIRE collected the record. It corresponds to dateofcollection of the result
* to the pid.value of the result represented in the internal model - dateofcollection of type String to store
* information about the time OpenAIRE collected the record. It corresponds to dateofcollection of the result
* represented in the internal model - lasteupdatetimestamp of type String to store the timestamp of the last update of * represented in the internal model - lasteupdatetimestamp of type String to store the timestamp of the last update of
* the record. It corresponds to lastupdatetimestamp of the resord represented in the internal model * the record. It corresponds to lastupdatetimestamp of the resord represented in the internal model
*/ */
@ -101,8 +99,6 @@ public class Result implements Serializable {
private AccessRight bestaccessright; private AccessRight bestaccessright;
private List<Instance> instance;
private Container container;// Journal private Container container;// Journal
private List<String> documentationUrl; // software private List<String> documentationUrl; // software
@ -309,14 +305,6 @@ public class Result implements Serializable {
this.bestaccessright = bestaccessright; this.bestaccessright = bestaccessright;
} }
public List<Instance> getInstance() {
return instance;
}
public void setInstance(List<Instance> instance) {
this.instance = instance;
}
public List<String> getDocumentationUrl() { public List<String> getDocumentationUrl() {
return documentationUrl; return documentationUrl;
} }

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import eu.dnetlib.dhp.schema.dump.oaf.Instance;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
/**
* It extends eu.dnetlib.dhp.dump.oaf.Instance with values related to the community dump. In the Result dump this
* information is not present because it is dumped as a set of relations between the result and the datasource. -
* hostedby of type eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the
* instance can be viewed or downloaded. It is mapped against the hostedby parameter of the instance to be dumped and -
* key corresponds to hostedby.key - value corresponds to hostedby.value - collectedfrom of type
* eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance has been
* collected. It is mapped against the collectedfrom parameter of the instance to be dumped and - key corresponds to
* collectedfrom.key - value corresponds to collectedfrom.value
*/
public class CommunityInstance extends Instance {
private KeyValue hostedby;
private KeyValue collectedfrom;
public KeyValue getHostedby() {
return hostedby;
}
public void setHostedby(KeyValue hostedby) {
this.hostedby = hostedby;
}
public KeyValue getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(KeyValue collectedfrom) {
this.collectedfrom = collectedfrom;
}
}

View File

@ -10,11 +10,13 @@ import eu.dnetlib.dhp.schema.dump.oaf.Result;
* extends eu.dnetlib.dhp.schema.dump.oaf.Result with the following parameters: - projects of type * extends eu.dnetlib.dhp.schema.dump.oaf.Result with the following parameters: - projects of type
* List<eu.dnetlib.dhp.schema.dump.oaf.community.Project> to store the list of projects related to the result. The * List<eu.dnetlib.dhp.schema.dump.oaf.community.Project> to store the list of projects related to the result. The
* information is added after the result is mapped to the external model - context of type * information is added after the result is mapped to the external model - context of type
* List<eu.dnetlib.dhp.schema/dump.oaf.community.Context> to store information about the RC RI related to the result. * List<eu.dnetlib.dhp.schema.dump.oaf.community.Context> to store information about the RC RI related to the result.
* For each context in the result represented in the internal model one context in the external model is produced - * For each context in the result represented in the internal model one context in the external model is produced -
* collectedfrom of type List<eu.dnetliv.dhp.schema.dump.oaf.KeyValue> to store information about the sources from which * collectedfrom of type List<eu.dnetliv.dhp.schema.dump.oaf.KeyValue> to store information about the sources from which
* the record has been collected. For each collectedfrom in the result represented in the internal model one * the record has been collected. For each collectedfrom in the result represented in the internal model one
* collectedfrom in the external model is produced * collectedfrom in the external model is produced - instance of type
* List<eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance> to store all the instances associated to the result.
* It corresponds to the same parameter in the result represented in the internal model
*/ */
public class CommunityResult extends Result { public class CommunityResult extends Result {
@ -24,6 +26,16 @@ public class CommunityResult extends Result {
protected List<KeyValue> collectedfrom; protected List<KeyValue> collectedfrom;
private List<CommunityInstance> instance;
public List<CommunityInstance> getInstance() {
return instance;
}
public void setInstance(List<CommunityInstance> instance) {
this.instance = instance;
}
public List<KeyValue> getCollectedfrom() { public List<KeyValue> getCollectedfrom() {
return collectedfrom; return collectedfrom;
} }

View File

@ -9,39 +9,10 @@ import java.io.Serializable;
* (e.c. Akademy of Finland) - fundingStream of type String to store the funding stream - jurisdiction of type String to * (e.c. Akademy of Finland) - fundingStream of type String to store the funding stream - jurisdiction of type String to
* store the jurisdiction of the funder * store the jurisdiction of the funder
*/ */
public class Funder implements Serializable { public class Funder extends eu.dnetlib.dhp.schema.dump.oaf.Funder {
private String shortName;
private String name;
private String fundingStream; private String fundingStream;
private String jurisdiction;
public String getJurisdiction() {
return jurisdiction;
}
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public String getShortName() {
return shortName;
}
public void setShortName(String shortName) {
this.shortName = shortName;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getFundingStream() { public String getFundingStream() {
return fundingStream; return fundingStream;
} }

View File

@ -8,21 +8,12 @@ import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
/** /**
* To store information about the project related to the result. This information is not directly mapped from the result * To store information about the project related to the result. This information is not directly mapped from the result
* represented in the internal model because it is not there. The mapped result will be enriched with project * represented in the internal model because it is not there. The mapped result will be enriched with project
* information derived by relation between results and projects. Project class has the following parameters: - id of * information derived by relation between results and projects. Project extends eu.dnetlib.dhp.schema.dump.oaf.Project
* type String to store the OpenAIRE id for the Project - code of type String to store the grant agreement - acronym of * with the following parameters: - funder of type eu.dnetlib.dhp.schema.dump.oaf.community.Funder to store information
* type String to store the acronym for the project - title of type String to store the title of the project - funder of * about the funder funding the project - provenance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance to store
* type eu.dnetlib.dhp.schema.dump.oaf.community.Funder to store information about the funder funding the project - * information about the. provenance of the association between the result and the project
* provenance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance to store information about the. provenance of the
* association between the result and the project
*/ */
public class Project implements Serializable { public class Project extends eu.dnetlib.dhp.schema.dump.oaf.Project {
private String id;// OpenAIRE id
private String code;
private String acronym;
private String title;
private Funder funder; private Funder funder;
@ -36,38 +27,6 @@ public class Project implements Serializable {
this.provenance = provenance; this.provenance = provenance;
} }
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public Funder getFunder() { public Funder getFunder() {
return funder; return funder;
} }

View File

@ -4,46 +4,14 @@ package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable; import java.io.Serializable;
/** /**
* To store information about the funder funding the project related to the result. It has the following parameters: * To store information about the funder funding the project related to the result. It extends
* - private String shortName to store the short name of the funder (e.g. AKA) * eu.dnetlib.dhp.schema.dump.oaf.Funder with the following parameter: - - private
* - private String name to store information about the name of the funder (e.g. Akademy of Finland) * eu.dnetdlib.dhp.schema.dump.oaf.graph.Fundings funding_stream to store the fundingstream
* - private Fundings funding_stream to store the fundingstream
* - private String jurisdiction to store information about the jurisdiction of the funder
*/ */
public class Funder implements Serializable { public class Funder extends eu.dnetlib.dhp.schema.dump.oaf.Funder {
private String shortName;
private String name;
private Fundings funding_stream; private Fundings funding_stream;
private String jurisdiction;
public String getShortName() {
return shortName;
}
public void setShortName(String shortName) {
this.shortName = shortName;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getJurisdiction() {
return jurisdiction;
}
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public Fundings getFunding_stream() { public Fundings getFunding_stream() {
return funding_stream; return funding_stream;
} }

View File

@ -4,13 +4,13 @@ package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable; import java.io.Serializable;
/** /**
* To store inforamtion about the funding stream. It has two parameters: * To store inforamtion about the funding stream. It has two parameters: - private String id to store the id of the
* - private String id to store the id of the fundings stream. The id is created by appending the shortname of the * fundings stream. The id is created by appending the shortname of the funder to the name of each level in the xml
* funder to the name of each level in the xml representing the fundng stream. For example: if the funder is the * representing the fundng stream. For example: if the funder is the European Commission, the funding level 0 name is
* European Commission, the funding level 0 name is FP7, the funding level 1 name is SP3 and the funding level 2 name is * FP7, the funding level 1 name is SP3 and the funding level 2 name is PEOPLE then the id will be: EC::FP7::SP3::PEOPLE
* PEOPLE then the id will be: EC::FP7::SP3::PEOPLE * - private String description to describe the funding stream. It is created by concatenating the description of each
* - private String description to describe the funding stream. It is created by concatenating the description of each funding * funding level so for the example above the description would be: SEVENTH FRAMEWORK PROGRAMME - SP3-People -
* level so for the example above the description would be: SEVENTH FRAMEWORK PROGRAMME - SP3-People - Marie-Curie Actions * Marie-Curie Actions
*/ */
public class Fundings implements Serializable { public class Fundings implements Serializable {

View File

@ -5,10 +5,9 @@ import java.io.Serializable;
import java.util.Optional; import java.util.Optional;
/** /**
* To describe the funded amount. It has the following parameters: * To describe the funded amount. It has the following parameters: - private String currency to store the currency of
* - private String currency to store the currency of the fund * the fund - private float totalcost to store the total cost of the project - private float fundedamount to store the
* - private float totalcost to store the total cost of the project * funded amount by the funder
* - private float fundedamount to store the funded amount by the funder
*/ */
public class Granted implements Serializable { public class Granted implements Serializable {
private String currency; private String currency;

View File

@ -0,0 +1,24 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.Instance;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
/**
* It extends the eu.dnetlib.dhp.schema.dump.oaf.Result with - instance of type
* List<eu.dnetlib.dhp.schema.dump.oaf.Instance> to store all the instances associated to the result. It corresponds to
* the same parameter in the result represented in the internal model
*/
public class GraphResult extends Result {
private List<Instance> instance;
public List<Instance> getInstance() {
return instance;
}
public void setInstance(List<Instance> instance) {
this.instance = instance;
}
}

View File

@ -4,13 +4,10 @@ package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable; import java.io.Serializable;
/** /**
* To represent the generic node in a relation. It has the following parameters: * To represent the generic node in a relation. It has the following parameters: - private String id the openaire id of
* - private String id the openaire id of the entity in the relation * the entity in the relation - private String type the type of the entity in the relation. Consider the generic
* - private String type the type of the entity in the relation. * relation between a Result R and a Project P, the node representing R will have as id the id of R and as type result,
* * while the node representing the project will have as id the id of the project and as type project
* Consider the generic relation between a Result R and a Project P, the node representing R will have
* as id the id of R and as type result, while the node representing the project will have as id the id of the project
* and as type project
*/ */
public class Node implements Serializable { public class Node implements Serializable {
private String id; private String id;

View File

@ -11,14 +11,12 @@ import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project; import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
/** /**
* To represent the generic organizaiton. It has the following parameters: * To represent the generic organizaiton. It has the following parameters: - private String legalshortname to store the
* - private String legalshortname to store the legalshortname of the organizaiton * legalshortname of the organizaiton - private String legalname to store the legal name of the organization - private
* - private String legalname to store the legal name of the organization * String websiteurl to store the websiteurl of the organization - private List<String> alternativenames to store the
* - private String websiteurl to store the websiteurl of the organization * alternative names of the organization - private Qualifier country to store the country of the organization - private
* - private List<String> alternativenames to store the alternative names of the organization * String id to store the id of the organization - private List<ControlledField> pid to store the list of pids for the
* - private Qualifier country to store the country of the organization * organization
* - private String id to store the id of the organization
* - private List<ControlledField> pid to store the list of pids for the organization
*/ */
public class Organization implements Serializable { public class Organization implements Serializable {
private String legalshortname; private String legalshortname;

View File

@ -4,9 +4,8 @@ package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable; import java.io.Serializable;
/** /**
* To store information about the ec programme for the project. It has the following parameters: * To store information about the ec programme for the project. It has the following parameters: - private String code
* - private String code to store the code of the programme * to store the code of the programme - private String description to store the description of the programme
* - private String description to store the description of the programme
*/ */
public class Programme implements Serializable { public class Programme implements Serializable {
private String code; private String code;

View File

@ -31,7 +31,7 @@ import java.util.List;
* - private List<Funder> funding to store the list of funder of the project * - private List<Funder> funding to store the list of funder of the project
* - private String summary to store the summary of the project * - private String summary to store the summary of the project
* - private Granted granted to store the granted amount * - private Granted granted to store the granted amount
* - private List<H2020Classification> h2020classification to store the list of H2020 classifications the project is related to * - private List<Programme> h2020programme to store the list of programmes the project is related to
*/ */
public class Project implements Serializable { public class Project implements Serializable {
@ -60,7 +60,7 @@ public class Project implements Serializable {
private Granted granted; private Granted granted;
private List<H2020Classification> h2020Classifications; private List<Programme> h2020programme;
public String getId() { public String getId() {
return id; return id;
@ -182,11 +182,11 @@ public class Project implements Serializable {
this.granted = granted; this.granted = granted;
} }
public List<H2020Classification> getH2020Classifications() { public List<Programme> getH2020programme() {
return h2020Classifications; return h2020programme;
} }
public void setH2020Classifications(List<H2020Classification> h2020Classifications) { public void setH2020programme(List<Programme> h2020programme) {
this.h2020Classifications = h2020Classifications; this.h2020programme = h2020programme;
} }
} }

View File

@ -4,12 +4,11 @@ package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable; import java.io.Serializable;
/** /**
* To represent the semantics of the generic relation between two entities. It has the following parameters: * To represent the semantics of the generic relation between two entities. It has the following parameters: - private
* - private String name to store the semantics of the relation (i.e. isAuthorInstitutionOf). It corresponds to the * String name to store the semantics of the relation (i.e. isAuthorInstitutionOf). It corresponds to the relclass
* relclass parameter in the relation represented in the internal model * parameter in the relation represented in the internal model represented in the internal model - private String type
* represented in the internal model * to store the type of the relation (i.e. affiliation). It corresponds to the subreltype parameter of the relation
* - private String type to store the type of the relation (i.e. affiliation). It corresponds to the subreltype parameter * represented in theinternal model
* of the relation represented in theinternal model
*/ */
public class RelType implements Serializable { public class RelType implements Serializable {
private String name; // relclass private String name; // relclass

View File

@ -7,11 +7,10 @@ import java.util.Objects;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance; import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
/** /**
* To represent the gereric relation between two entities. It has the following parameters: * To represent the gereric relation between two entities. It has the following parameters: - private Node source to
* - private Node source to represent the entity source of the relation * represent the entity source of the relation - private Node target to represent the entity target of the relation -
* - private Node target to represent the entity target of the relation * private RelType reltype to represent the semantics of the relation - private Provenance provenance to represent the
* - private RelType reltype to represent the semantics of the relation * provenance of the relation
* - private Provenance provenance to represent the provenance of the relation
*/ */
public class Relation implements Serializable { public class Relation implements Serializable {
private Node source; private Node source;

View File

@ -5,13 +5,17 @@ import java.io.Serializable;
/** /**
* To represent entity of type RC/RI. It has the following parameters, which are mostly derived by the profile * To represent entity of type RC/RI. It has the following parameters, which are mostly derived by the profile
* - private String id to store the openaire id for the entity. Is has as code 00 and will be created as * - private
* 00|context_____::md5(originalId) * String id to store the openaire id for the entity. Is has as code 00 and will be created as
* private String originalId to store the id of the context as provided in the profile (i.e. mes) * 00|context_____::md5(originalId) private
* private String name to store the name of the context (got from the label attribute in the context definition) * String originalId to store the id of the context as provided in the profile
* private String type to store the type of the context (i.e.: research initiative or research community) * (i.e. mes)
* private String description to store the description of the context as given in the profile * - private String name to store the name of the context (got from the label attribute in the context
* private String zenodo_community to store the zenodo community associated to the context (main zenodo community) * definition)
* - private String type to store the type of the context (i.e.: research initiative or research community)
* - private String description to store the description of the context as given in the profile
* -private String
* zenodo_community to store the zenodo community associated to the context (main zenodo community)
*/ */
public class ResearchInitiative implements Serializable { public class ResearchInitiative implements Serializable {
private String id; // openaireId private String id; // openaireId

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.project;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
@ -11,6 +12,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -175,43 +177,54 @@ public class PrepareProgramme {
return csvProgramme; return csvProgramme;
}); });
prepareClassification(h2020Programmes); // prepareClassification(h2020Programmes);
h2020Programmes JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1);
rdd
.map(csvProgramme -> {
String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme);
return tmp;
})
.saveAsTextFile(outputPath); .saveAsTextFile(outputPath);
} }
private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) { private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
Object[] codedescription = h2020Programmes Object[] codedescription = h2020Programmes
.map(value -> new Tuple2<>(value.getCode(), value.getTitle())) .map(
value -> new Tuple2<>(value.getCode(),
new Tuple2<String, String>(value.getTitle(), value.getShortTitle())))
.collect() .collect()
.toArray(); .toArray();
for (int i = 0; i < codedescription.length - 1; i++) { for (int i = 0; i < codedescription.length - 1; i++) {
for (int j = i + 1; j < codedescription.length; j++) { for (int j = i + 1; j < codedescription.length; j++) {
Tuple2<String, String> t2i = (Tuple2<String, String>) codedescription[i]; Tuple2<String, Tuple2<String, String>> t2i = (Tuple2<String, Tuple2<String, String>>) codedescription[i];
Tuple2<String, String> t2j = (Tuple2<String, String>) codedescription[j]; Tuple2<String, Tuple2<String, String>> t2j = (Tuple2<String, Tuple2<String, String>>) codedescription[j];
if (t2i._1().compareTo(t2j._1()) > 0) { if (t2i._1().compareTo(t2j._1()) > 0) {
Tuple2<String, String> temp = t2i; Tuple2<String, Tuple2<String, String>> temp = t2i;
codedescription[i] = t2j; codedescription[i] = t2j;
codedescription[j] = temp; codedescription[j] = temp;
} }
} }
} }
Map<String, String> map = new HashMap<>(); Map<String, Tuple2<String, String>> map = new HashMap<>();
for (int j = 0; j < codedescription.length; j++) { for (int j = 0; j < codedescription.length; j++) {
Tuple2<String, String> entry = (Tuple2<String, String>) codedescription[j]; Tuple2<String, Tuple2<String, String>> entry = (Tuple2<String, Tuple2<String, String>>) codedescription[j];
String ent = entry._1(); String ent = entry._1();
if (ent.contains("Euratom-")) { if (ent.contains("Euratom-")) {
ent = ent.replace("-Euratom-", ".Euratom."); ent = ent.replace("-Euratom-", ".Euratom.");
} }
String[] tmp = ent.split("\\."); String[] tmp = ent.split("\\.");
if (tmp.length <= 2) { if (tmp.length <= 2) {
map.put(entry._1(), entry._2()); if (StringUtils.isEmpty(entry._2()._2())) {
map.put(entry._1(), new Tuple2<String, String>(entry._2()._1(), entry._2()._1()));
} else {
map.put(entry._1(), entry._2());
}
} else { } else {
if (ent.endsWith(".")) { if (ent.endsWith(".")) {
ent = ent.substring(0, ent.length() - 1); ent = ent.substring(0, ent.length() - 1);
@ -224,14 +237,14 @@ public class PrepareProgramme {
key = key.substring(0, key.length() - 1); key = key.substring(0, key.length() - 1);
} }
} }
String current = entry._2(); String current = entry._2()._1();
if (!ent.contains("Euratom")) { if (!ent.contains("Euratom")) {
String parent; String parent;
String tmp_key = tmp[0] + "."; String tmp_key = tmp[0] + ".";
for (int i = 1; i < tmp.length - 1; i++) { for (int i = 1; i < tmp.length - 1; i++) {
tmp_key += tmp[i] + "."; tmp_key += tmp[i] + ".";
parent = map.get(tmp_key).toLowerCase().trim(); parent = map.get(tmp_key)._1().toLowerCase().trim();
if (parent.contains("|")) { if (parent.contains("|")) {
parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
} }
@ -246,18 +259,29 @@ public class PrepareProgramme {
} }
} }
map.put(ent + ".", map.get(key) + " | " + current); String shortTitle = entry._2()._2();
if (StringUtils.isEmpty(shortTitle)) {
shortTitle = current;
}
Tuple2<String, String> newEntry = new Tuple2<>(map.get(key)._1() + " | " + current,
map.get(key)._2() + " | " + shortTitle);
map.put(ent + ".", newEntry);
} }
} }
h2020Programmes.foreach(csvProgramme -> { return h2020Programmes.map(csvProgramme -> {
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
&& !csvProgramme.getCode().equals("H2020-EC")) String code = csvProgramme.getCode();
csvProgramme.setClassification(map.get(csvProgramme.getCode() + ".")); if (!code.endsWith(".") && !code.contains("Euratom")
else && !code.equals("H2020-EC"))
csvProgramme.setClassification(map.get(csvProgramme.getCode())); code += ".";
});
csvProgramme.setClassification(map.get(code)._1());
csvProgramme.setClassification_short(map.get(code)._2());
return csvProgramme;
}).collect();
} }
public static <R> Dataset<R> readPath( public static <R> Dataset<R> readPath(

View File

@ -9,7 +9,6 @@ import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -138,7 +137,8 @@ public class SparkAtomicActionJob {
pm.setCode(csvProject.getProgramme()); pm.setCode(csvProject.getProgramme());
h2020classification.setClassification(ocsvProgramme.get().getClassification()); h2020classification.setClassification(ocsvProgramme.get().getClassification());
h2020classification.setH2020Programme(pm); h2020classification.setH2020Programme(pm);
setLevelsAndProgramme(h2020classification, ocsvProgramme.get().getClassification()); setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
pp.setH2020classification(Arrays.asList(h2020classification)); pp.setH2020classification(Arrays.asList(h2020classification));
return pp; return pp;
@ -177,8 +177,8 @@ public class SparkAtomicActionJob {
} }
private static void setLevelsAndProgramme(H2020Classification h2020Classification, String classification) { private static void setLevelsandProgramme(H2020Classification h2020Classification, String classification_short) {
String[] tmp = classification.split(" \\| "); String[] tmp = classification_short.split(" \\| ");
h2020Classification.setLevel1(tmp[0]); h2020Classification.setLevel1(tmp[0]);
if (tmp.length > 1) { if (tmp.length > 1) {
h2020Classification.setLevel2(tmp[1]); h2020Classification.setLevel2(tmp[1]);
@ -189,6 +189,12 @@ public class SparkAtomicActionJob {
h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
} }
// private static void setProgramme(H2020Classification h2020Classification, String classification) {
// String[] tmp = classification.split(" \\| ");
//
// h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
// }
public static <R> Dataset<R> readPath( public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) { SparkSession spark, String inputPath, Class<R> clazz) {
return spark return spark

View File

@ -22,6 +22,15 @@ public class CSVProgramme implements Serializable {
private String shortTitle; private String shortTitle;
private String language; private String language;
private String classification; private String classification;
private String classification_short;
public String getClassification_short() {
return classification_short;
}
public void setClassification_short(String classification_short) {
this.classification_short = classification_short;
}
public String getClassification() { public String getClassification() {
return classification; return classification;

View File

@ -9,12 +9,14 @@ import java.util.List;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
@Disabled
public class EXCELParserTest { public class EXCELParserTest {
private static Path workingDir; private static Path workingDir;

View File

@ -92,6 +92,8 @@ public class PrepareH2020ProgrammeTest {
Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count()); Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count());
// tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme)));
Assertions Assertions
.assertEquals( .assertEquals(
"Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft", "Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft",

View File

@ -78,7 +78,7 @@ public class SparkUpdateProjectTest {
"-programmePath", "-programmePath",
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz") "/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz")
.getPath(), .getPath(),
"-projectPath", "-projectPath",
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(), getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(),
@ -124,7 +124,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Societal challenges", "Societal Challenges",
execverification execverification
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
.select("classification.level1") .select("classification.level1")
@ -133,7 +133,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Smart, Green And Integrated Transport", "Transport",
execverification execverification
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
.select("classification.level2") .select("classification.level2")
@ -188,7 +188,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Nurturing excellence by means of cross-border and cross-sector mobility", "MSCA Mobility",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.h2020Programme.description") .select("classification.h2020Programme.description")
@ -197,7 +197,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Excellent science", "Excellent Science",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.level1") .select("classification.level1")
@ -206,7 +206,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Marie Skłodowska-Curie Actions", "Marie-Sklodowska-Curie Actions",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.level2") .select("classification.level2")
@ -215,7 +215,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Nurturing excellence by means of cross-border and cross-sector mobility", "MSCA Mobility",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.level3") .select("classification.level3")

View File

@ -6,8 +6,10 @@ import org.apache.commons.logging.LogFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.ssl.SSLContextBuilder;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@Disabled
public class HttpConnectorTest { public class HttpConnectorTest {
private static final Log log = LogFactory.getLog(HttpConnectorTest.class); private static final Log log = LogFactory.getLog(HttpConnectorTest.class);

View File

@ -1,6 +1,6 @@
package eu.dnetlib.dhp.doiboost package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.dhp.schema.oaf.{Publication, Relation}
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
@ -21,6 +21,13 @@ class QueryTest {
}
def has_ands(r:Relation) :Boolean = {
r.getCollectedfrom!= null && r.getCollectedfrom.asScala.count(k => k.getValue.contains("Australian")) > 0
} }
def hasInstanceWithUrl(p:Publication):Boolean = { def hasInstanceWithUrl(p:Publication):Boolean = {
@ -38,6 +45,8 @@ class QueryTest {
def myQuery(spark:SparkSession, sc:SparkContext): Unit = { def myQuery(spark:SparkSession, sc:SparkContext): Unit = {
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)

View File

@ -18,6 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions { public class CleaningFunctions {
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NONE = "none";
public static <T extends Oaf> T fixVocabularyNames(T value) { public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
@ -106,6 +107,23 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPid())) {
r
.setPid(
r
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(sp -> {
sp.setValue(StringUtils.trim(sp.getValue()));
return sp;
})
.collect(Collectors.toList()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r r
.setResourcetype( .setResourcetype(

View File

@ -11,6 +11,7 @@ import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import java.io.*; import java.io.*;
import java.util.Optional;
import org.apache.commons.compress.archivers.ar.ArArchiveEntry; import org.apache.commons.compress.archivers.ar.ArArchiveEntry;
import org.apache.commons.compress.archivers.ar.ArArchiveOutputStream; import org.apache.commons.compress.archivers.ar.ArArchiveOutputStream;
@ -14,6 +15,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.MakeTarArchive;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
public class MakeTar implements Serializable { public class MakeTar implements Serializable {
@ -39,16 +41,22 @@ public class MakeTar implements Serializable {
final String inputPath = parser.get("sourcePath"); final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath); log.info("input path : {}", inputPath);
final int gBperSplit = Optional
.ofNullable(parser.get("splitSize"))
.map(Integer::valueOf)
.orElse(10);
Configuration conf = new Configuration(); Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf); FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath); makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit);
} }
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException { public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath)); RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
@ -59,56 +67,9 @@ public class MakeTar implements Serializable {
String p_string = p.toString(); String p_string = p.toString();
String entity = p_string.substring(p_string.lastIndexOf("/") + 1); String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity); MakeTarArchive.tarMaxSize(fileSystem, p_string, outputPath + "/" + entity, entity, gBperSplit);
} }
} }
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
while (fileStatusListIterator.hasNext()) {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
entry.setSize(fileStatus.getLen());
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte data[] = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}
bis.close();
ar.closeArchiveEntry();
}
}
ar.close();
}
} }

View File

@ -9,8 +9,10 @@ import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*; import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult; import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.dump.oaf.community.Context; import eu.dnetlib.dhp.schema.dump.oaf.community.Context;
import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.Journal;
@ -18,12 +20,12 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ResultMapper implements Serializable { public class ResultMapper implements Serializable {
public static <I extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map( public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
I in, Map<String, String> communityMap, boolean graph) { E in, Map<String, String> communityMap, boolean graph) {
Result out; Result out;
if (graph) { if (graph) {
out = new Result(); out = new GraphResult();
} else { } else {
out = new CommunityResult(); out = new CommunityResult();
} }
@ -154,7 +156,6 @@ public class ResultMapper implements Serializable {
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue()))); .ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList); out.setContributor(contributorList);
// List<Country> countryList = new ArrayList<>();
Optional Optional
.ofNullable(input.getCountry()) .ofNullable(input.getCountry())
.ifPresent( .ifPresent(
@ -186,8 +187,6 @@ public class ResultMapper implements Serializable {
.filter(Objects::nonNull) .filter(Objects::nonNull)
.collect(Collectors.toList()))); .collect(Collectors.toList())));
// out.setCountry(countryList);
final List<String> coverageList = new ArrayList<>(); final List<String> coverageList = new ArrayList<>();
Optional Optional
.ofNullable(input.getCoverage()) .ofNullable(input.getCoverage())
@ -214,15 +213,19 @@ public class ResultMapper implements Serializable {
out.setId(input.getId()); out.setId(input.getId());
out.setOriginalId(input.getOriginalId()); out.setOriginalId(input.getOriginalId());
final List<Instance> instanceList = new ArrayList<>(); Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
Optional .ofNullable(input.getInstance());
.ofNullable(input.getInstance())
.ifPresent( if (oInst.isPresent()) {
inst -> inst if (graph) {
.stream() ((GraphResult) out)
.forEach(i -> instanceList.add(getInstance(i, graph)))); .setInstance(oInst.get().stream().map(i -> getGraphInstance(i)).collect(Collectors.toList()));
out } else {
.setInstance(instanceList); ((CommunityResult) out)
.setInstance(
oInst.get().stream().map(i -> getCommunityInstance(i)).collect(Collectors.toList()));
}
}
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage()); Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) { if (oL.isPresent()) {
@ -364,20 +367,34 @@ public class ResultMapper implements Serializable {
} }
private static Instance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i, boolean graph) { private static Instance getGraphInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
Instance instance = new Instance(); Instance instance = new Instance();
if (!graph) { setCommonValue(i, instance);
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
}
return instance;
}
private static CommunityInstance getCommunityInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
CommunityInstance instance = new CommunityInstance();
setCommonValue(i, instance);
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
return instance;
}
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
.ofNullable(i.getAccessright()); .ofNullable(i.getAccessright());
if (opAr.isPresent()) { if (opAr.isPresent()) {
@ -402,21 +419,17 @@ public class ResultMapper implements Serializable {
Optional Optional
.ofNullable(i.getRefereed()) .ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getClassname())); .ifPresent(value -> instance.setRefereed(value.getClassname()));
// .ifPresent(value -> instance.setRefereed(value.getValue()));
Optional Optional
.ofNullable(i.getInstancetype()) .ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname())); .ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value)); Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
return instance;
} }
private static List<Provenance> getUniqueProvenance(List<Provenance> provenance) { private static List<Provenance> getUniqueProvenance(List<Provenance> provenance) {
Provenance iProv = new Provenance(); Provenance iProv = new Provenance();
// iProv.setProvenance(Constants.INFERRED);
Provenance hProv = new Provenance(); Provenance hProv = new Provenance();
// hProv.setProvenance(Constants.HARVESTED);
Provenance lProv = new Provenance(); Provenance lProv = new Provenance();
for (Provenance p : provenance) { for (Provenance p : provenance) {

View File

@ -17,6 +17,10 @@ import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
public class SendToZenodoHDFS implements Serializable { public class SendToZenodoHDFS implements Serializable {
private final static String NEW = "new"; // to be used for a brand new deposition in zenodo
private final static String VERSION = "version"; // to be used to upload a new version of a published deposition
private final static String UPDATE = "update"; // to upload content to an open deposition not published
private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class); private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
public static void main(final String[] args) throws Exception, MissingConceptDoiException { public static void main(final String[] args) throws Exception, MissingConceptDoiException {
@ -34,10 +38,16 @@ public class SendToZenodoHDFS implements Serializable {
final String access_token = parser.get("accessToken"); final String access_token = parser.get("accessToken");
final String connection_url = parser.get("connectionUrl"); final String connection_url = parser.get("connectionUrl");
final String metadata = parser.get("metadata"); final String metadata = parser.get("metadata");
final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition")); final String depositionType = parser.get("depositionType");
final String concept_rec_id = Optional final String concept_rec_id = Optional
.ofNullable(parser.get("conceptRecordId")) .ofNullable(parser.get("conceptRecordId"))
.orElse(null); .orElse(null);
final Boolean publish = Optional
.ofNullable(parser.get("publish"))
.map(Boolean::valueOf)
.orElse(false);
final String depositionId = Optional.ofNullable(parser.get("depositionId")).orElse(null);
final String communityMapPath = parser.get("communityMapPath"); final String communityMapPath = parser.get("communityMapPath");
Configuration conf = new Configuration(); Configuration conf = new Configuration();
@ -51,13 +61,22 @@ public class SendToZenodoHDFS implements Serializable {
.listFiles( .listFiles(
new Path(hdfsPath), true); new Path(hdfsPath), true);
ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token); ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
if (newDeposition) { switch (depositionType) {
zenodoApiClient.newDeposition(); case NEW:
} else { zenodoApiClient.newDeposition();
if (concept_rec_id == null) { break;
throw new MissingConceptDoiException("No concept record id has been provided"); case VERSION:
} if (concept_rec_id == null) {
zenodoApiClient.newVersion(concept_rec_id); throw new MissingConceptDoiException("No concept record id has been provided");
}
zenodoApiClient.newVersion(concept_rec_id);
break;
case UPDATE:
if (depositionId == null) {
throw new MissingConceptDoiException("No deposition id has been provided");
}
zenodoApiClient.uploadOpenDeposition(depositionId);
break;
} }
while (fileStatusListIterator.hasNext()) { while (fileStatusListIterator.hasNext()) {
@ -79,9 +98,12 @@ public class SendToZenodoHDFS implements Serializable {
} }
} }
if (!metadata.equals("")) {
zenodoApiClient.sendMretadata(metadata);
}
zenodoApiClient.sendMretadata(metadata); if (publish)
zenodoApiClient.publish(); zenodoApiClient.publish();
} }

View File

@ -17,7 +17,7 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.graph.Constants; import eu.dnetlib.dhp.oa.graph.dump.complete.Constants;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -70,4 +70,5 @@ public class Utils {
return new Gson().fromJson(sb.toString(), CommunityMap.class); return new Gson().fromJson(sb.toString(), CommunityMap.class);
} }
} }

View File

@ -135,12 +135,17 @@ public class SparkPrepareResultProject implements Serializable {
.orElse(null), .orElse(null),
Optional Optional
.ofNullable(op.getFundingtree()) .ofNullable(op.getFundingtree())
.map( .map(value -> {
value -> value List<Funder> tmp = value
.stream() .stream()
.map(ft -> getFunder(ft.getValue())) .map(ft -> getFunder(ft.getValue()))
.collect(Collectors.toList()) .collect(Collectors.toList());
.get(0)) if (tmp.size() > 0) {
return tmp.get(0);
} else {
return null;
}
})
.orElse(null)); .orElse(null));
Optional<DataInfo> di = Optional.ofNullable(op.getDataInfo()); Optional<DataInfo> di = Optional.ofNullable(op.getDataInfo());

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable; import java.io.Serializable;
@ -21,6 +21,7 @@ public class Constants implements Serializable {
public static final String CONTEXT_ID = "00"; public static final String CONTEXT_ID = "00";
public static final String CONTEXT_NS_PREFIX = "context_____"; public static final String CONTEXT_NS_PREFIX = "context_____";
public static final String UNKNOWN = "UNKNOWN";
// public static final String FUNDER_DS = "entityregistry::projects"; // public static final String FUNDER_DS = "entityregistry::projects";
} }

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.IOException; import java.io.IOException;
@ -38,7 +38,7 @@ public class CreateContextEntities implements Serializable {
.toString( .toString(
CreateContextEntities.class CreateContextEntities.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_entity_parameter.json")); "/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.IOException; import java.io.IOException;
@ -44,7 +44,7 @@ public class CreateContextRelation implements Serializable {
.toString( .toString(
CreateContextRelation.class CreateContextRelation.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_entity_parameter.json")); "/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -48,7 +48,7 @@ public class DumpGraphEntities implements Serializable {
DumpProducts d = new DumpProducts(); DumpProducts d = new DumpProducts();
d d
.run( .run(
isSparkSessionManaged, inputPath, outputPath, communityMapPath, inputClazz, Result.class, isSparkSessionManaged, inputPath, outputPath, communityMapPath, inputClazz, GraphResult.class,
true); true);
break; break;
case "40": case "40":
@ -379,17 +379,16 @@ public class DumpGraphEntities implements Serializable {
} }
project project
.setH2020Classifications( .setH2020programme(
Optional Optional
.ofNullable(p.getH2020classification()) .ofNullable(p.getH2020classification())
.map( .map(
classification -> classification classification -> classification
.stream() .stream()
.map( .map(
c -> H2020Classification c -> Programme
.newInstance( .newInstance(
c.getH2020Programme().getCode(), c.getH2020Programme().getDescription(), c.getH2020Programme().getCode(), c.getH2020Programme().getDescription()))
c.getLevel1(), c.getLevel2(), c.getLevel3(), c.getClassification()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(new ArrayList<>())); .orElse(new ArrayList<>()));
@ -488,7 +487,12 @@ public class DumpGraphEntities implements Serializable {
Optional Optional
.ofNullable(org.getCountry()) .ofNullable(org.getCountry())
.ifPresent( .ifPresent(
value -> organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname()))); value -> {
if (!value.getClassid().equals(Constants.UNKNOWN)) {
organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname()));
}
});
Optional Optional
.ofNullable(org.getId()) .ofNullable(org.getId())

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -23,13 +23,12 @@ import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
/** /**
* Creates new Relations (as in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation) from the information in the Entity. * Creates new Relations (as in eu.dnetlib.dhp.schema.dump.oaf.graph.Relation) from the information in the Entity. The
* The new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context * new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
* related to communities and research initiative/infrastructures. * to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
* * -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
* For collectedfrom elements it creates: datasource -> provides -> result and result -> isProvidedBy -> datasource * and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result. Note for
* For hostedby elements it creates: datasource -> hosts -> result and result -> isHostedBy -> datasource * context: it gets the first provenance in the dataInfo. If more than one is present the others are not dumped
* For context elements it creates: context <-> isRelatedTo <-> result
*/ */
public class Extractor implements Serializable { public class Extractor implements Serializable {

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable; import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;

View File

@ -1,10 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -16,9 +17,8 @@ import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*; import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
/** /**
* It process the ContextInfo information to produce a new Context Entity or a set of Relations between the * It process the ContextInfo information to produce a new Context Entity or a set of Relations between the generic
* generic context entity and datasource/projects related to the context. * context entity and datasource/projects related to the context.
*
*/ */
public class Process implements Serializable { public class Process implements Serializable {
private static final Logger log = LoggerFactory.getLogger(Process.class); private static final Logger log = LoggerFactory.getLogger(Process.class);
@ -39,7 +39,9 @@ public class Process implements Serializable {
ri.setDescription(ci.getDescription()); ri.setDescription(ci.getDescription());
ri.setName(ci.getName()); ri.setName(ci.getName());
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity()); if (StringUtils.isNotEmpty(ci.getZenodocommunity())) {
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
}
return (R) ri; return (R) ri;
} catch (final Exception e) { } catch (final Exception e) {

View File

@ -1,13 +1,9 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.StringReader; import java.io.StringReader;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*; import java.util.*;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -16,8 +12,6 @@ import org.dom4j.Node;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -8,6 +8,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -15,12 +16,11 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.Result; import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation; import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
/** /**
* Reads all the entities of the same type (Relation / Results) and saves them in the same folder * Reads all the entities of the same type (Relation / Results) and saves them in the same folder
*
*/ */
public class SparkCollectAndSave implements Serializable { public class SparkCollectAndSave implements Serializable {
@ -31,7 +31,7 @@ public class SparkCollectAndSave implements Serializable {
.toString( .toString(
SparkCollectAndSave.class SparkCollectAndSave.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_collect_and_save.json")); "/eu/dnetlib/dhp/oa/graph/dump/complete/input_collect_and_save.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
@ -48,6 +48,11 @@ public class SparkCollectAndSave implements Serializable {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final Boolean aggregateResult = Optional
.ofNullable(parser.get("resultAggregation"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -55,22 +60,42 @@ public class SparkCollectAndSave implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Utils.removeOutputDir(spark, outputPath + "/result"); Utils.removeOutputDir(spark, outputPath + "/result");
run(spark, inputPath, outputPath); run(spark, inputPath, outputPath, aggregateResult);
}); });
} }
private static void run(SparkSession spark, String inputPath, String outputPath) { private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
Utils if (aggregate) {
.readPath(spark, inputPath + "/result/publication", Result.class) Utils
.union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class)) .readPath(spark, inputPath + "/result/publication", GraphResult.class)
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class)) .union(Utils.readPath(spark, inputPath + "/result/dataset", GraphResult.class))
.union(Utils.readPath(spark, inputPath + "/result/software", Result.class)) .union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class))
.write() .union(Utils.readPath(spark, inputPath + "/result/software", GraphResult.class))
.option("compression", "gzip") .write()
.mode(SaveMode.Overwrite) .option("compression", "gzip")
.json(outputPath + "/result"); .mode(SaveMode.Overwrite)
.json(outputPath + "/result");
} else {
write(
Utils
.readPath(spark, inputPath + "/result/publication", GraphResult.class),
outputPath + "/publication");
write(
Utils
.readPath(spark, inputPath + "/result/dataset", GraphResult.class),
outputPath + "/dataset");
write(
Utils
.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class),
outputPath + "/otheresearchproduct");
write(
Utils
.readPath(spark, inputPath + "/result/software", GraphResult.class),
outputPath + "/software");
}
Utils Utils
.readPath(spark, inputPath + "/relation/publication", Relation.class) .readPath(spark, inputPath + "/relation/publication", Relation.class)
@ -86,4 +111,12 @@ public class SparkCollectAndSave implements Serializable {
.json(outputPath + "/relation"); .json(outputPath + "/relation");
} }
private static void write(Dataset<GraphResult> dataSet, String outputPath) {
dataSet
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
} }

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable; import java.io.Serializable;
import java.util.Optional; import java.util.Optional;
@ -22,7 +22,7 @@ public class SparkDumpEntitiesJob implements Serializable {
.toString( .toString(
SparkDumpEntitiesJob.class SparkDumpEntitiesJob.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_parameters.json")); "/eu/dnetlib/dhp/oa/graph/dump/complete/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -8,6 +8,8 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
@ -20,6 +22,7 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance; import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Node; import eu.dnetlib.dhp.schema.dump.oaf.graph.Node;
import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType; import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
/** /**
@ -34,7 +37,7 @@ public class SparkDumpRelationJob implements Serializable {
.toString( .toString(
SparkDumpRelationJob.class SparkDumpRelationJob.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_relationdump_parameters.json")); "/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
@ -65,40 +68,54 @@ public class SparkDumpRelationJob implements Serializable {
} }
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) { private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) {
Utils Dataset<Relation> relations = Utils.readPath(spark, inputPath, Relation.class);
.readPath(spark, inputPath, Relation.class) relations
.map(relation -> { .map((MapFunction<Relation, eu.dnetlib.dhp.schema.dump.oaf.graph.Relation>) relation -> {
eu.dnetlib.dhp.schema.dump.oaf.graph.Relation rel = new eu.dnetlib.dhp.schema.dump.oaf.graph.Relation(); eu.dnetlib.dhp.schema.dump.oaf.graph.Relation rel_new = new eu.dnetlib.dhp.schema.dump.oaf.graph.Relation();
rel rel_new
.setSource( .setSource(
Node Node
.newInstance( .newInstance(
relation.getSource(), relation.getSource(),
ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)))); ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2))));
rel rel_new
.setTarget( .setTarget(
Node Node
.newInstance( .newInstance(
relation.getTarget(), relation.getTarget(),
ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)))); ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2))));
rel rel_new
.setReltype( .setReltype(
RelType RelType
.newInstance( .newInstance(
relation.getRelClass(), relation.getRelClass(),
relation.getSubRelType())); relation.getSubRelType()));
Optional Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
.ofNullable(relation.getDataInfo()) if (odInfo.isPresent()) {
.ifPresent( DataInfo dInfo = odInfo.get();
datainfo -> rel if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent()) {
.setProvenance( if (Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
Provenance rel_new
.newInstance(datainfo.getProvenanceaction().getClassname(), datainfo.getTrust()))); .setProvenance(
Provenance
.newInstance(
dInfo.getProvenanceaction().getClassname(),
dInfo.getTrust()));
}
}
}
// Optional
// .ofNullable(relation.getDataInfo())
// .ifPresent(
// datainfo -> rel_new
// .setProvenance(
// Provenance
// .newInstance(datainfo.getProvenanceaction().getClassname(), datainfo.getTrust())));
return rel; return rel_new;
}, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class)) }, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
.write() .write()

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
@ -9,9 +9,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
/** /**

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -19,6 +19,7 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance; import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
@ -27,8 +28,8 @@ import eu.dnetlib.dhp.schema.dump.oaf.graph.RelType;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
/** /**
* Create new Relations between Context Entities and Organizations whose products are associated to the context. * Create new Relations between Context Entities and Organizations whose products are associated to the context. It
* It produces relation such as: organization <-> isRelatedTo <-> context * produces relation such as: organization <-> isRelatedTo <-> context
*/ */
public class SparkOrganizationRelation implements Serializable { public class SparkOrganizationRelation implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkOrganizationRelation.class); private static final Logger log = LoggerFactory.getLogger(SparkOrganizationRelation.class);
@ -38,7 +39,7 @@ public class SparkOrganizationRelation implements Serializable {
.toString( .toString(
SparkOrganizationRelation.class SparkOrganizationRelation.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump_whole/input_organization_parameters.json")); "/eu/dnetlib/dhp/oa/graph/dump/complete/input_organization_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
@ -59,6 +60,9 @@ public class SparkOrganizationRelation implements Serializable {
.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class); .fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
log.info("organization map : {}", new Gson().toJson(organizationMap)); log.info("organization map : {}", new Gson().toJson(organizationMap));
final String communityMapPath = parser.get("communityMapPath");
log.info("communityMapPath: {} ", communityMapPath);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -66,14 +70,17 @@ public class SparkOrganizationRelation implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Utils.removeOutputDir(spark, outputPath); Utils.removeOutputDir(spark, outputPath);
extractRelation(spark, inputPath, organizationMap, outputPath); extractRelation(spark, inputPath, organizationMap, outputPath, communityMapPath);
}); });
} }
private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap, private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap,
String outputPath) { String outputPath, String communityMapPath) {
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class); Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
relationDataset.createOrReplaceTempView("relation"); relationDataset.createOrReplaceTempView("relation");
@ -97,32 +104,43 @@ public class SparkOrganizationRelation implements Serializable {
}, Encoders.bean(MergedRels.class)) }, Encoders.bean(MergedRels.class))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.collectAsList() .collectAsList()
.forEach(getMergedRelsConsumer(organizationMap, relList)); .forEach(getMergedRelsConsumer(organizationMap, relList, communityMap));
organizationMap organizationMap
.keySet() .keySet()
.forEach( .forEach(
oId -> organizationMap oId -> organizationMap
.get(oId) .get(oId)
.forEach(community -> addRelations(relList, community, oId))); .forEach(community -> {
if (communityMap.containsKey(community)) {
addRelations(relList, community, oId);
}
}));
// if (relList.size() > 0) {
spark spark
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class)) .createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
// }
} }
@NotNull @NotNull
private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap, private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap,
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList) { List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList, CommunityMap communityMap) {
return mergedRels -> { return mergedRels -> {
String oId = mergedRels.getOrganizationId(); String oId = mergedRels.getOrganizationId();
organizationMap organizationMap
.get(oId) .get(oId)
.forEach(community -> addRelations(relList, community, mergedRels.getRepresentativeId())); .forEach(community -> {
if (communityMap.containsKey(community)) {
addRelations(relList, community, mergedRels.getRepresentativeId());
}
});
organizationMap.remove(oId); organizationMap.remove(oId);
}; };
} }

View File

@ -0,0 +1,3 @@
package eu.dnetlib.dhp.sx.graph
case class IdReplace(newId:String, oldId:String) {}

View File

@ -1,13 +1,15 @@
package eu.dnetlib.dhp.sx.graph package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation} import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import eu.dnetlib.dhp.sx.ebi.EBIAggregator import eu.dnetlib.dhp.sx.ebi.EBIAggregator
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.apache.spark.sql.functions.col
object SparkSplitOafTODLIEntities { object SparkSplitOafTODLIEntities {
@ -18,38 +20,38 @@ object SparkSplitOafTODLIEntities {
} }
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
parser.parseArgument(args)
val workingPath: String = parser.get("workingPath") def extract_dataset(spark:SparkSession, workingPath:String) :Unit = {
logger.info(s"Working dir path = $workingPath")
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf].repartition(4000)
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset].repartition(1000)
OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIDataset])
.map(s =>s.asInstanceOf[DLIDataset])
.union(ebi_dataset)
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.repartition(2000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
}
def extract_publication(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
val spark:SparkSession = SparkSession
.builder()
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master(parser.get("master"))
.getOrCreate()
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf] val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset] val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication].repartition(1000)
val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication]
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation]
OAFDataset OAFDataset
@ -60,20 +62,17 @@ object SparkSplitOafTODLIEntities {
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn) .agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.repartition(1000) .repartition(2000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication") .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication")
OAFDataset }
.filter(s => s != null && s.isInstanceOf[DLIDataset])
.map(s =>s.asInstanceOf[DLIDataset])
.union(ebi_dataset)
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.repartition(1000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
def extract_unknown(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
OAFDataset OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIUnknown]) .filter(s => s != null && s.isInstanceOf[DLIUnknown])
@ -82,9 +81,46 @@ object SparkSplitOafTODLIEntities {
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIUnknownAggregator().toColumn) .agg(EBIAggregator.getDLIUnknownAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.repartition(1000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown") .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown")
}
def extract_ids(o:Oaf) :(String, String) = {
o match {
case p: DLIPublication =>
val prefix = StringUtils.substringBefore(p.getId, "|")
val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
(p.getId, s"$prefix|$original")
case p: DLIDataset =>
val prefix = StringUtils.substringBefore(p.getId, "|")
val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
(p.getId, s"$prefix|$original")
case _ =>null
}
}
def extract_relations(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000)
OAFDataset
.filter(o => o.isInstanceOf[Result])
.map(extract_ids)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.filter(r => r != null)
.where("_1 != _2")
.select(col("_1").alias("newId"), col("_2").alias("oldId"))
.distinct()
.map(f => IdReplace(f.getString(0), f.getString(1)))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/id_replace")
OAFDataset OAFDataset
.filter(s => s != null && s.isInstanceOf[Relation]) .filter(s => s != null && s.isInstanceOf[Relation])
@ -94,10 +130,70 @@ object SparkSplitOafTODLIEntities {
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn) .agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.repartition(1000) .repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation") .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation_unfixed")
val relations = spark.read.load(s"$workingPath/graph/relation_unfixed").as[Relation]
val ids = spark.read.load(s"$workingPath/graph/id_replace").as[IdReplace]
relations
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
.joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
.map(i =>{
val r = i._1._2
if (i._2 != null)
{
val id = i._2.newId
r.setSource(id)
}
r
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/rel_f_source")
val rel_source:Dataset[Relation] = spark.read.load(s"$workingPath/graph/rel_f_source").as[Relation]
rel_source
.map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoder))
.joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
.map(i =>{
val r:Relation = i._1._2
if (i._2 != null)
{
val id = i._2.newId
r.setTarget(id)
}
r
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
parser.parseArgument(args)
val workingPath: String = parser.get("workingPath")
val entity:String = parser.get("entity")
logger.info(s"Working dir path = $workingPath")
val spark:SparkSession = SparkSession
.builder()
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master(parser.get("master"))
.getOrCreate()
entity match {
case "publication" => extract_publication(spark, workingPath)
case "dataset" => extract_dataset(spark,workingPath)
case "relation" => extract_relations(spark, workingPath)
case "unknown" => extract_unknown(spark, workingPath)
}

View File

@ -1,18 +1,18 @@
<workflow-app name="dump_community_products" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="dump_community_products" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sourcePath</name> <name>sourcePath</name>
<description>the source path</description> <description>the source path</description>
</property> </property>
<property> <property>
<name>isLookUpUrl</name> <name>isLookUpUrl</name>
<description>the isLookup service endpoint</description> <description>the isLookup service endpoint</description>
</property> </property>
<property> <property>
<name>outputPath</name> <name>outputPath</name>
<description>the output path</description> <description>the output path</description>
</property> </property>
<property> <property>
<name>accessToken</name> <name>accessToken</name>
<description>the access token used for the deposition in Zenodo</description> <description>the access token used for the deposition in Zenodo</description>
@ -320,6 +320,7 @@
<ok to="join_extend"/> <ok to="join_extend"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="extend_orp"> <action name="extend_orp">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -344,6 +345,7 @@
<ok to="join_extend"/> <ok to="join_extend"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="extend_software"> <action name="extend_software">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -371,43 +373,42 @@
<join name="join_extend" to="splitForCommunities"/> <join name="join_extend" to="splitForCommunities"/>
<action name="splitForCommunities"> <action name="splitForCommunities">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Split dumped result for community</name> <name>Split dumped result for community</name>
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity</class> <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/ext</arg> <arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
<arg>--outputPath</arg><arg>${workingDir}/split</arg> <arg>--outputPath</arg><arg>${workingDir}/split</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
</spark> </spark>
<ok to="make_archive"/> <ok to="make_archive"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="make_archive"> <action name="make_archive">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class> <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
<arg>--hdfsPath</arg><arg>${outputPath}</arg> <arg>--hdfsPath</arg><arg>${outputPath}</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg> <arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${workingDir}/split</arg> <arg>--sourcePath</arg><arg>${workingDir}/split</arg>
</java> </java>
<ok to="send_zenodo"/> <ok to="send_zenodo"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="send_zenodo"> <action name="send_zenodo">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class> <main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
@ -418,14 +419,12 @@
<arg>--metadata</arg><arg>${metadata}</arg> <arg>--metadata</arg><arg>${metadata}</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg> <arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
<arg>--newDeposition</arg><arg>${newDeposition}</arg> <arg>--depositionType</arg><arg>${depositionType}</arg>
</java> </java>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -17,7 +17,13 @@
"paramLongName": "isSparkSessionManaged", "paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise", "paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false "paramRequired": false
} },
{
"paramName": "ra",
"paramLongName": "resultAggregation",
"paramDescription": "true if all the result type should be saved under the generic result name. false to get a different dump for each result type",
"paramRequired": true
}
] ]

View File

@ -23,6 +23,12 @@
"paramLongName": "isSparkSessionManaged", "paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise", "paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false "paramRequired": false
},
{
"paramName":"cmp",
"paramLongName":"communityMapPath",
"paramDescription": "the path to the serialization of the community map",
"paramRequired": true
} }
] ]

View File

@ -1,18 +1,22 @@
<workflow-app name="dump_whole_graph" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="dump_whole_graph" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sourcePath</name> <name>sourcePath</name>
<description>the source path</description> <description>the source path</description>
</property> </property>
<property> <property>
<name>isLookUpUrl</name> <name>isLookUpUrl</name>
<description>the isLookup service endpoint</description> <description>the isLookup service endpoint</description>
</property> </property>
<property> <property>
<name>outputPath</name> <name>outputPath</name>
<description>the output path</description> <description>the output path</description>
</property> </property>
<property>
<name>resultAggregation</name>
<description>true if all the result type have to be dumped under result. false otherwise</description>
</property>
<property> <property>
<name>accessToken</name> <name>accessToken</name>
<description>the access token used for the deposition in Zenodo</description> <description>the access token used for the deposition in Zenodo</description>
@ -26,13 +30,17 @@
<description> the metadata associated to the deposition</description> <description> the metadata associated to the deposition</description>
</property> </property>
<property> <property>
<name>newDeposition</name> <name>depositionType</name>
<description>true if it is a brand new depositon. false for new version of an old deposition</description> <description>the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided)</description>
</property> </property>
<property> <property>
<name>conceptRecordId</name> <name>conceptRecordId</name>
<description>for new version, the id of the record for the old deposition</description> <description>for new version, the id of the record for the old deposition</description>
</property> </property>
<property>
<name>depositionId</name>
<description>the depositionId of a deposition open that has to be added content</description>
</property>
<property> <property>
<name>organizationCommunityMap</name> <name>organizationCommunityMap</name>
<description>the organization community map</description> <description>the organization community map</description>
@ -148,7 +156,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table publication </name> <name>Dump table publication </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -174,7 +182,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table dataset </name> <name>Dump table dataset </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -200,7 +208,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table ORP </name> <name>Dump table ORP </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -226,7 +234,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table software </name> <name>Dump table software </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -252,7 +260,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table organization </name> <name>Dump table organization </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -268,7 +276,6 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--outputPath</arg><arg>${workingDir}/collect/organization</arg> <arg>--outputPath</arg><arg>${workingDir}/collect/organization</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_dump"/> <ok to="join_dump"/>
<error to="Kill"/> <error to="Kill"/>
@ -279,7 +286,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table project </name> <name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -295,7 +302,6 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${workingDir}/collect/project</arg> <arg>--outputPath</arg><arg>${workingDir}/collect/project</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_dump"/> <ok to="join_dump"/>
<error to="Kill"/> <error to="Kill"/>
@ -306,7 +312,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table datasource </name> <name>Dump table datasource </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -322,7 +328,6 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--outputPath</arg><arg>${workingDir}/collect/datasource</arg> <arg>--outputPath</arg><arg>${workingDir}/collect/datasource</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_dump"/> <ok to="join_dump"/>
<error to="Kill"/> <error to="Kill"/>
@ -333,7 +338,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table relation </name> <name>Dump table relation </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpRelationJob</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpRelationJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -352,10 +357,8 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="join_dump" to="fork_context"/> <join name="join_dump" to="fork_context"/>
<fork name="fork_context"> <fork name="fork_context">
<path start="create_entities_fromcontext"/> <path start="create_entities_fromcontext"/>
<path start="create_relation_fromcontext"/> <path start="create_relation_fromcontext"/>
@ -364,8 +367,8 @@
<action name="create_entities_fromcontext"> <action name="create_entities_fromcontext">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.graph.CreateContextEntities</main-class> <main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextEntities</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/collect/context</arg> <arg>--hdfsPath</arg><arg>${workingDir}/collect/communities_infrastructures</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg> <arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</java> </java>
@ -375,7 +378,7 @@
<action name="create_relation_fromcontext"> <action name="create_relation_fromcontext">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.graph.CreateContextRelation</main-class> <main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextRelation</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/relation/context</arg> <arg>--hdfsPath</arg><arg>${workingDir}/relation/context</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg> <arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
@ -384,13 +387,12 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="create_relation_fromorgs"> <action name="create_relation_fromorgs">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table relation </name> <name>Dump table relation </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkOrganizationRelation</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -405,6 +407,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg> <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/contextOrg</arg> <arg>--outputPath</arg><arg>${workingDir}/relation/contextOrg</arg>
<arg>--organizationCommunityMap</arg><arg>${organizationCommunityMap}</arg> <arg>--organizationCommunityMap</arg><arg>${organizationCommunityMap}</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
</spark> </spark>
<ok to="join_context"/> <ok to="join_context"/>
<error to="Kill"/> <error to="Kill"/>
@ -412,7 +415,6 @@
<join name="join_context" to="fork_extract_relations"/> <join name="join_context" to="fork_extract_relations"/>
<fork name="fork_extract_relations"> <fork name="fork_extract_relations">
<path start="rels_from_pubs"/> <path start="rels_from_pubs"/>
<path start="rels_from_dats"/> <path start="rels_from_dats"/>
@ -425,7 +427,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Extract Relations from publication </name> <name>Extract Relations from publication </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -441,7 +443,6 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/publication</arg> <arg>--outputPath</arg><arg>${workingDir}/relation/publication</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_extract_relations"/> <ok to="join_extract_relations"/>
<error to="Kill"/> <error to="Kill"/>
@ -452,7 +453,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table dataset </name> <name>Dump table dataset </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -468,7 +469,6 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/dataset</arg> <arg>--outputPath</arg><arg>${workingDir}/relation/dataset</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_extract_relations"/> <ok to="join_extract_relations"/>
<error to="Kill"/> <error to="Kill"/>
@ -479,7 +479,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table ORP </name> <name>Dump table ORP </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -495,7 +495,6 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/orp</arg> <arg>--outputPath</arg><arg>${workingDir}/relation/orp</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_extract_relations"/> <ok to="join_extract_relations"/>
<error to="Kill"/> <error to="Kill"/>
@ -506,7 +505,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Dump table software </name> <name>Dump table software </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -522,13 +521,11 @@
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation/software</arg> <arg>--outputPath</arg><arg>${workingDir}/relation/software</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<!-- <arg>&#45;&#45;isLookUpUrl</arg><arg>${isLookUpUrl}</arg>-->
</spark> </spark>
<ok to="join_extract_relations"/> <ok to="join_extract_relations"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="join_extract_relations" to="collect_and_save"/> <join name="join_extract_relations" to="collect_and_save"/>
<action name="collect_and_save"> <action name="collect_and_save">
@ -536,7 +533,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Collect Results and Relations and put them in the right path </name> <name>Collect Results and Relations and put them in the right path </name>
<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkCollectAndSave</class> <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkCollectAndSave</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -550,6 +547,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}</arg> <arg>--sourcePath</arg><arg>${workingDir}</arg>
<arg>--outputPath</arg><arg>${workingDir}/collect</arg> <arg>--outputPath</arg><arg>${workingDir}/collect</arg>
<arg>--resultAggregation</arg><arg>${resultAggregation}</arg>
</spark> </spark>
<ok to="make_archive"/> <ok to="make_archive"/>
<error to="Kill"/> <error to="Kill"/>
@ -565,8 +563,7 @@
<ok to="send_zenodo"/> <ok to="send_zenodo"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="send_zenodo"> <action name="send_zenodo">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class> <main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
@ -577,7 +574,8 @@
<arg>--metadata</arg><arg>${metadata}</arg> <arg>--metadata</arg><arg>${metadata}</arg>
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg> <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg> <arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
<arg>--newDeposition</arg><arg>${newDeposition}</arg> <arg>--depositionType</arg><arg>${depositionType}</arg>
<arg>--depositionId</arg><arg>${depositionId}</arg>
</java> </java>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -0,0 +1,37 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "Description of the research community/research infrastructure"
},
"id": {
"type": "string",
"description": "OpenAIRE id of the research community/research infrastructure"
},
"name": {
"type": "string",
"description": "The long name of the community"
},
"originalId": {
"type": "string",
"description": "The acronym of the community"
},
"subject": {
"description": "Only for research communities: the list of the subjects associated to the research community",
"type": "array",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"description": "One of {Research Community, Research infrastructure}"
},
"zenodo_community": {
"type": "string",
"description": "The URL of the Zenodo community associated to the Research community/Research infrastructure"
}
}
}

View File

@ -0,0 +1,192 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"ControlledField": {
"type": "object",
"properties": {
"scheme": {
"type": "string"
},
"value": {
"type": "string"
}
},
"description": "To represent the information described by a scheme and a value in that scheme (i.e. pid)"
}
},
"type": "object",
"properties": {
"accessrights": {
"type": "string",
"description": "Type of access to the data source, as defined by re3data.org. Possible values: {open, restricted, closed}"
},
"certificates": {
"type": "string",
"description": "The certificate, seal or standard the data source complies with. As defined by re3data.org."
},
"citationguidelineurl": {
"type": "string",
"description":"The URL of the data source providing information on how to cite its items. As defined by re3data.org."
},
"contenttypes": {
"description": "Types of content in the data source, as defined by OpenDOAR",
"type": "array",
"items": {
"type": "string"
}
},
"databaseaccessrestriction": {
"type": "string",
"description": "Access restrinctions to the data source, as defined by re3data.org. One of {feeRequired, registration, other}"
},
"datasourcetype": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
},
{
"description": "The type of the datasource. See https://api.openaire.eu/vocabularies/dnet:datasource_typologies"
}
]
},
"datauploadrestriction": {
"type": "string",
"description": "Upload restrictions applied by the datasource, as defined by re3data.org. One of {feeRequired, registration, other}"
},
"dateofvalidation": {
"type": "string",
"description": "The date of last validation against the OpenAIRE guidelines for the datasource records"
},
"description": {
"type": "string"
},
"englishname": {
"type": "string",
"description": "The English name of the datasource"
},
"id": {
"type": "string",
"description": "The OpenAIRE id of the data source"
},
"journal": {
"type": "object",
"properties": {
"conferencedate": {
"type": "string"
},
"conferenceplace": {
"type": "string"
},
"edition": {
"type": "string"
},
"ep": {
"type": "string",
"description": "End page"
},
"iss": {
"type": "string",
"description": "Issue number"
},
"issnLinking": {
"type": "string"
},
"issnOnline": {
"type": "string"
},
"issnPrinted": {
"type": "string"
},
"name": {
"type": "string"
},
"sp": {
"type": "string",
"description": "Start page"
},
"vol": {
"type": "string",
"description": "Volume"
}
},
"description": "Information about the journal, if this data source is of type Journal."
},
"languages": {
"description": "The languages present in the data source's content, as defined by OpenDOAR.",
"type": "array",
"items": {
"type": "string"
}
},
"logourl": {
"type": "string"
},
"missionstatementurl": {
"type": "string",
"description":"The URL of a mission statement describing the designated community of the data source. As defined by re3data.org"
},
"officialname": {
"type": "string",
"description": "The official name of the datasource"
},
"openairecompatibility": {
"type": "string",
"description": "OpenAIRE guidelines the data source comply with. See also https://guidelines.openaire.eu."
},
"originalId": {
"description": "Original identifiers for the datasource"
"type": "array",
"items": {
"type": "string"
}
},
"pid": {
"description": "Persistent identifiers of the datasource",
"type": "array",
"items": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
}
]
}
},
"pidsystems": {
"type": "string",
"description": "The persistent identifier system that is used by the data source. As defined by re3data.org"
},
"policies": {
"description": "Policies of the data source, as defined in OpenDOAR.",
"type": "array",
"items": {
"type": "string"
}
},
"releaseenddate": {
"type": "string",
"description": "Date when the data source went offline or stopped ingesting new research data. As defined by re3data.org"
},
"releasestartdate": {
"type": "string",
"description": "Releasing date of the data source, as defined by re3data.org"
},
"subjects": {
"description": "List of subjects associated to the datasource",
"type": "array",
"items": {
"type": "string"
}
},
"uploadrights": {
"type": "string",
"description": "Type of data upload. As defined by re3data.org: one of {open, restricted,closed}"
},
"versioning": {
"type": "boolean",
"description": "As defined by redata.org: 'yes' if the data source supports versioning, 'no' otherwise."
},
"websiteurl": {
"type": "string"
}
}
}

View File

@ -3,11 +3,10 @@
"type": "object", "type": "object",
"properties": { "properties": {
"alternativenames": { "alternativenames": {
"description": "Description of alternativenames", "description": "Alternative names that identify the organisation",
"type": "array", "type": "array",
"items": { "items": {
"type": "string", "type": "string"
"description": "Description of alternativenames"
} }
}, },
"country": { "country": {
@ -15,48 +14,44 @@
"properties": { "properties": {
"code": { "code": {
"type": "string", "type": "string",
"description": "Description of code" "description": "The organisation country code"
}, },
"label": { "label": {
"type": "string", "type": "string",
"description": "Description of label" "description": "The organisation country label"
} }
}, },
"description": "Description of country" "description": "The country of the organisation"
}, },
"id": { "id": {
"type": "string", "type": "string",
"description": "Description of id" "description": "The OpenAIRE id for the organisation"
}, },
"legalname": { "legalname": {
"type": "string", "type": "string"
"description": "Description of legalname"
}, },
"legalshortname": { "legalshortname": {
"type": "string", "type": "string"
"description": "Description of legalshortname"
}, },
"pid": { "pid": {
"description": "Description of pid", "description": "Persistent identifiers for the organisation i.e. isni 0000000090326370",
"type": "array", "type": "array",
"items": { "items": {
"type": "object", "type": "object",
"properties": { "properties": {
"scheme": { "scheme": {
"type": "string", "type": "string",
"description": "Description of scheme" "description": "The scheme of the identifier (i.e. isni)"
}, },
"value": { "value": {
"type": "string", "type": "string",
"description": "Description of value" "description": "the value in the schema (i.e. 0000000090326370)"
} }
}, }
"description": "Description of pid"
} }
}, },
"websiteurl": { "websiteurl": {
"type": "string", "type": "string"
"description": "Description of websiteurl"
} }
} }
} }

View File

@ -0,0 +1,119 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"acronym": {
"type": "string"
},
"callidentifier": {
"type": "string"
},
"code": {
"type": "string",
"description": "The grant agreement number"
},
"enddate": {
"type": "string"
},
"funding": {
"description": "Funding information for the project",
"type": "array",
"items": {
"type": "object",
"properties": {
"funding_stream": {
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "Description of the funding stream"
},
"id": {
"type": "string",
"description": "Id of the funding stream"
}
}
},
"jurisdiction": {
"type": "string",
"description": "The jurisdiction of the funder (i.e. EU)"
},
"name": {
"type": "string",
"description": "The name of the funder (European Commission)"
},
"shortName": {
"type": "string",
"description": "The short name of the funder (EC)"
}
}
}
},
"granted": {
"type": "object",
"properties": {
"currency": {
"type": "string",
"description": "The currency of the granted amount (e.g. EUR)"
},
"fundedamount": {
"type": "number",
"description": "The funded amount"
},
"totalcost": {
"type": "number",
"description": "The total cost of the project"
}
},
"description": "The money granted to the project"
},
"h2020programme": {
"description": "The h2020 programme funding the project",
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "The code of the programme"
},
"description": {
"type": "string",
"description": "The description of the programme"
}
}
}
},
"id": {
"type": "string",
"description": "OpenAIRE id for the project"
},
"keywords": {
"type": "string"
},
"openaccessmandatefordataset": {
"type": "boolean"
},
"openaccessmandateforpublications": {
"type": "boolean"
},
"startdate": {
"type": "string"
},
"subject": {
"type": "array",
"items": {
"type": "string"
}
},
"summary": {
"type": "string"
},
"title": {
"type": "string"
},
"websiteurl": {
"type": "string"
}
}
}

View File

@ -0,0 +1,60 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"Node": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The OpenAIRE id of the entity"
},
"type": {
"type": "string",
"description": "The type of the entity (i.e. organisation)"
}
}
}
},
"type": "object",
"properties": {
"provenance": {
"type": "object",
"properties": {
"provenance": {
"type": "string",
"description": "The reason why OpenAIRE holds the relation "
},
"trust": {
"type": "string",
"description": "The trust of the relation in the range of [0,1]. Where greater the number, more the trust. Harvested relationships have typically a high trust (0.9). The trust of inferred relationship is calculated by the inference algorithm that generated them, as described in https://graph.openaire.eu/about#architecture (Enrichment --> Mining)"
}
}
},
"reltype": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The semantics of the relation (i.e. isAuthorInstitutionOf). "
},
"type": {
"type": "string",
"description": "the type of the relation (i.e. affiliation)"
}
},
"description": "To represent the semantics of a relation between two entities"
},
"source": {
"allOf": [
{"$ref": "#/definitions/Node"},
{"description": "The node source in the relation"}
]
},
"target": {
"allOf": [
{"$ref": "#/definitions/Node"},
{"description": "The node target in the relation"}
]
}
}
}

View File

@ -0,0 +1,330 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"ControlledField": {
"type": "object",
"properties": {
"scheme": {
"type": "string"
},
"value": {
"type": "string"
}
},
"description": "To represent the information described by a scheme and a value in that scheme (i.e. pid)"
},
"Provenance": {
"type": "object",
"properties": {
"provenance": {
"type": "string",
"description": "The process that produced/provided the information"
},
"trust": {
"type": "string"
}
},
"description": "Indicates the process that produced (or provided) the information, and the trust associated to the information"
}
},
"type": "object",
"properties": {
"author": {
"type": "array",
"items": {
"type": "object",
"properties": {
"fullname": {
"type": "string"
},
"name": {
"type": "string"
},
"pid": {
"type": "object",
"properties": {
"id": {
"allOf": [
{"$ref": "#/definitions/ControlledField"},
{"description": "The author's id and scheme. OpenAIRE currently supports 'ORCID'"}
]
},
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Provenance of author's pid"}
]
}
}
},
"rank": {
"type": "integer"
},
"surname": {
"type": "string"
}
}
}
},
"bestaccessright": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/"
},
"label": {
"type": "string",
"description": "Label for the access mode"
},
"scheme": {
"type": "string",
"description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/"
}
},
"description": "The openest access right associated to the manifestations of this research results"
},
"codeRepositoryUrl": {
"type": "string",
"description": "Only for results with type 'software': the URL to the repository with the source code"
},
"contactgroup": {
"description": "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource",
"type": "array",
"items": {
"type": "string"
}
},
"contactperson": {
"description": "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource",
"type": "array",
"items": {
"type": "string"
}
},
"container": {
"type": "object",
"properties": {
"conferencedate": {
"type": "string"
},
"conferenceplace": {
"type": "string"
},
"edition": {
"type": "string",
"description": "Edition of the journal or conference proceeding"
},
"ep": {
"type": "string",
"description": "End page"
},
"iss": {
"type": "string",
"description": "Journal issue"
},
"issnLinking": {
"type": "string"
},
"issnOnline": {
"type": "string"
},
"issnPrinted": {
"type": "string"
},
"name": {
"type": "string",
"description": "Name of the journal or conference"
},
"sp": {
"type": "string",
"description": "start page"
},
"vol": {
"type": "string"
}
},
"description": "Container has information about the conference or journal where the result has been presented or published"
},
"contributor": {
"type": "array",
"items": {
"type": "string",
"description": "Description of contributor"
}
},
"country": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "ISO 3166-1 alpha-2 country code"
},
"label": {
"type": "string"
},
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this result is associated to the country."}
]
}
}
}
},
"coverage": {
"type": "array",
"items": {
"type": "string"
}
},
"dateofcollection": {
"type": "string",
"description": "When OpenAIRE collected the record the last time"
},
"description": {
"type": "array",
"items": {
"type": "string"
}
},
"documentationUrl": {
"description": "Only for results with type 'software': URL to the software documentation",
"type": "array",
"items": {
"type": "string"
}
},
"embargoenddate": {
"type": "string",
"description": "Date when the embargo ends and this result turns Open Access"
},
"format": {
"type": "array",
"items": {
"type": "string"
}
},
"geolocation": {
"description": "Geolocation information",
"type": "array",
"items": {
"type": "object",
"properties": {
"box": {
"type": "string"
},
"place": {
"type": "string"
},
"point": {
"type": "string"
}
}
}
},
"id": {
"type": "string",
"description": "OpenAIRE Identifier"
},
"language": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "alpha-3/ISO 639-2 code of the language"
},
"label": {
"type": "string",
"description": "English label"
}
}
},
"lastupdatetimestamp": {
"type": "integer",
"description": "Timestamp of last update of the record in OpenAIRE"
},
"maintitle": {
"type": "string"
},
"originalId": {
"description": "Identifiers of the record at the original sources",
"type": "array",
"items": {
"type": "string"
}
},
"pid": {
"description": "Persistent identifiers of the result",
"type": "array",
"items": {
"allOf": [
{"$ref": "#/definitions/ControlledField"},
{"description": "scheme: list of available schemes are at https://api.openaire.eu/vocabularies/dnet:pid_types, value: the PID of the result "}
]
}
},
"programmingLanguage": {
"type": "string",
"description": "Only for results with type 'software': the programming language"
},
"publicationdate": {
"type": "string"
},
"publisher": {
"type": "string"
},
"size": {
"type": "string",
"description": "Only for results with type 'dataset': the declared size of the dataset"
},
"source": {
"description": "See definition of Dublin Core field dc:source",
"type": "array",
"items": {
"type": "string"
}
},
"subjects": {
"description": "Keywords associated to the result",
"type": "array",
"items": {
"type": "object",
"properties": {
"provenance": {
"allOf": [
{"$ref": "#/definitions/Provenance"},
{"description": "Why this subject is associated to the result"}
]
},
"subject": {
"allOf": [
{"$ref": "#/definitions/ControlledField"},
{"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."},
]
}
}
}
},
"subtitle": {
"type": "string"
},
"tool": {
"description": "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product",
"type": "array",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"description": "Type of the result: one of 'publication', 'dataset', 'software', 'other' (see also https://api.openaire.eu/vocabularies/dnet:result_typologies)"
},
"version": {
"type": "string",
"description": "Version of the result"
}
}
}

View File

@ -17,6 +17,12 @@
"paramLongName":"nameNode", "paramLongName":"nameNode",
"paramDescription": "the name node", "paramDescription": "the name node",
"paramRequired": true "paramRequired": true
},
{
"paramName":"ss",
"paramLongName":"splitSize",
"paramDescription": "the maximum size of the archive",
"paramRequired": false
} }
] ]

View File

@ -1,9 +1,9 @@
[ [
{ {
"paramName":"nd", "paramName":"dt",
"paramLongName":"newDeposition", "paramLongName":"depositionType",
"paramDescription": "if it is a new deposition (true) or a new versione (false)", "paramDescription": "the type of the deposition (new, version, update)",
"paramRequired": true "paramRequired": true
}, },
{ {
@ -18,6 +18,12 @@
"paramDescription": "the path to the serialization of the community map", "paramDescription": "the path to the serialization of the community map",
"paramRequired": false "paramRequired": false
}, },
{
"paramName":"di",
"paramLongName":"depositionId",
"paramDescription": "the id of an open deposition which has not been published",
"paramRequired": false
},
{ {
"paramName":"hdfsp", "paramName":"hdfsp",
"paramLongName":"hdfsPath", "paramLongName":"hdfsPath",
@ -47,5 +53,11 @@
"paramLongName":"metadata", "paramLongName":"metadata",
"paramDescription": "metadata associated to the deposition", "paramDescription": "metadata associated to the deposition",
"paramRequired": false "paramRequired": false
} },
{
"paramName":"p",
"paramLongName":"publish",
"paramDescription": "if to publish the upload",
"paramRequired": false
}
] ]

View File

@ -1,38 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "Description of description"
},
"id": {
"type": "string",
"description": "Description of id"
},
"name": {
"type": "string",
"description": "Description of name"
},
"originalId": {
"type": "string",
"description": "Description of originalId"
},
"subject": {
"description": "Description of subject",
"type": "array",
"items": {
"type": "string",
"description": "Description of subject"
}
},
"type": {
"type": "string",
"description": "Description of type"
},
"zenodo_community": {
"type": "string",
"description": "Description of zenodo_community"
}
}
}

View File

@ -1,210 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"ControlledField": {
"type": "object",
"properties": {
"scheme": {
"type": "string",
"description": "Description of scheme"
},
"value": {
"type": "string",
"description": "Description of value"
}
}
}
},
"type": "object",
"properties": {
"accessrights": {
"type": "string",
"description": "Description of accessrights"
},
"certificates": {
"type": "string",
"description": "Description of certificates"
},
"citationguidelineurl": {
"type": "string",
"description": "Description of citationguidelineurl"
},
"contenttypes": {
"description": "Description of contenttypes",
"type": "array",
"items": {
"type": "string",
"description": "Description of contenttypes"
}
},
"databaseaccessrestriction": {
"type": "string",
"description": "Description of databaseaccessrestriction"
},
"datasourcetype": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
},
{
"description": "Description of datasourcetype"
}
]
},
"datauploadrestriction": {
"type": "string",
"description": "Description of datauploadrestriction"
},
"dateofvalidation": {
"type": "string",
"description": "Description of dateofvalidation"
},
"description": {
"type": "string",
"description": "Description of description"
},
"englishname": {
"type": "string",
"description": "Description of englishname"
},
"id": {
"type": "string",
"description": "Description of id"
},
"journal": {
"type": "object",
"properties": {
"conferencedate": {
"type": "string",
"description": "Description of conferencedate"
},
"conferenceplace": {
"type": "string",
"description": "Description of conferenceplace"
},
"edition": {
"type": "string",
"description": "Description of edition"
},
"ep": {
"type": "string",
"description": "Description of ep"
},
"iss": {
"type": "string",
"description": "Description of iss"
},
"issnLinking": {
"type": "string",
"description": "Description of issnLinking"
},
"issnOnline": {
"type": "string",
"description": "Description of issnOnline"
},
"issnPrinted": {
"type": "string",
"description": "Description of issnPrinted"
},
"name": {
"type": "string",
"description": "Description of name"
},
"sp": {
"type": "string",
"description": "Description of sp"
},
"vol": {
"type": "string",
"description": "Description of vol"
}
},
"description": "Description of journal"
},
"languages": {
"description": "Description of languages",
"type": "array",
"items": {
"type": "string",
"description": "Description of languages"
}
},
"logourl": {
"type": "string",
"description": "Description of logourl"
},
"missionstatementurl": {
"type": "string",
"description": "Description of missionstatementurl"
},
"officialname": {
"type": "string",
"description": "Description of officialname"
},
"openairecompatibility": {
"type": "string",
"description": "Description of openairecompatibility"
},
"originalId": {
"description": "Description of originalId",
"type": "array",
"items": {
"type": "string",
"description": "Description of originalId"
}
},
"pid": {
"description": "Description of pid",
"type": "array",
"items": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
},
{
"description": "Description of pid"
}
]
}
},
"pidsystems": {
"type": "string",
"description": "Description of pidsystems"
},
"policies": {
"description": "Description of policies",
"type": "array",
"items": {
"description": "Description of policies"
}
},
"releaseenddate": {
"type": "string",
"description": "Description of releaseenddate"
},
"releasestartdate": {
"type": "string",
"description": "Description of releasestartdate"
},
"subjects": {
"description": "Description of subjects",
"type": "array",
"items": {
"type": "string",
"description": "Description of subjects"
}
},
"uploadrights": {
"type": "string",
"description": "Description of uploadrights"
},
"versioning": {
"type": "boolean",
"description": "Description of versioning"
},
"websiteurl": {
"type": "string",
"description": "Description of websiteurl"
}
}
}

View File

@ -1,134 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"acronym": {
"type": "string",
"description": "Description of acronym"
},
"callidentifier": {
"type": "string",
"description": "Description of callidentifier"
},
"code": {
"type": "string",
"description": "Description of code"
},
"enddate": {
"type": "string",
"description": "Description of enddate"
},
"funding": {
"description": "Description of funding",
"type": "array",
"items": {
"type": "object",
"properties": {
"funding_stream": {
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "Description of description"
},
"id": {
"type": "string",
"description": "Description of id"
}
},
"description": "Description of funding_stream"
},
"jurisdiction": {
"type": "string",
"description": "Description of jurisdiction"
},
"name": {
"type": "string",
"description": "Description of name"
},
"shortName": {
"type": "string",
"description": "Description of shortName"
}
},
"description": "Description of funding"
}
},
"granted": {
"type": "object",
"properties": {
"currency": {
"type": "string",
"description": "Description of currency"
},
"fundedamount": {
"type": "number",
"description": "Description of fundedamount"
},
"totalcost": {
"type": "number",
"description": "Description of totalcost"
}
},
"description": "Description of granted"
},
"id": {
"type": "string",
"description": "Description of id"
},
"keywords": {
"type": "string",
"description": "Description of keywords"
},
"openaccessmandatefordataset": {
"type": "boolean",
"description": "Description of openaccessmandatefordataset"
},
"openaccessmandateforpublications": {
"type": "boolean",
"description": "Description of openaccessmandateforpublications"
},
"programme": {
"description": "Description of programme",
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Description of code"
},
"description": {
"type": "string",
"description": "Description of description"
}
},
"description": "Description of programme"
}
},
"startdate": {
"type": "string",
"description": "Description of startdate"
},
"subject": {
"description": "Description of subject",
"type": "array",
"items": {
"type": "string",
"description": "Description of subject"
}
},
"summary": {
"type": "string",
"description": "Description of summary"
},
"title": {
"type": "string",
"description": "Description of title"
},
"websiteurl": {
"type": "string",
"description": "Description of websiteurl"
}
}
}

View File

@ -1,69 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"Node": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "Description of id"
},
"type": {
"type": "string",
"description": "Description of type"
}
}
}
},
"type": "object",
"properties": {
"provenance": {
"type": "object",
"properties": {
"provenance": {
"type": "string",
"description": "Description of provenance"
},
"trust": {
"type": "string",
"description": "Description of trust"
}
},
"description": "Description of provenance"
},
"reltype": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "Description of name"
},
"type": {
"type": "string",
"description": "Description of type"
}
},
"description": "Description of reltype"
},
"source": {
"allOf": [
{
"$ref": "#/definitions/Node"
},
{
"description": "Description of source"
}
]
},
"target": {
"allOf": [
{
"$ref": "#/definitions/Node"
},
{
"description": "Description of target"
}
]
}
}
}

View File

@ -1,520 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"definitions": {
"AccessRight": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Description of code"
},
"label": {
"type": "string",
"description": "Description of label"
},
"scheme": {
"type": "string",
"description": "Description of scheme"
}
}
},
"ControlledField": {
"type": "object",
"properties": {
"scheme": {
"type": "string",
"description": "Description of scheme"
},
"value": {
"type": "string",
"description": "Description of value"
}
}
},
"KeyValue": {
"type": "object",
"properties": {
"key": {
"type": "string",
"description": "Description of key"
},
"value": {
"type": "string",
"description": "Description of value"
}
}
},
"Provenance": {
"type": "object",
"properties": {
"provenance": {
"type": "string",
"description": "Description of provenance"
},
"trust": {
"type": "string",
"description": "Description of trust"
}
}
}
},
"type": "object",
"properties": {
"author": {
"description": "Description of author",
"type": "array",
"items": {
"type": "object",
"properties": {
"affiliation": {
"description": "Description of affiliation",
"type": "array",
"items": {
"type": "string",
"description": "Description of affiliation"
}
},
"fullname": {
"type": "string",
"description": "Description of fullname"
},
"name": {
"type": "string",
"description": "Description of name"
},
"pid": {
"type": "object",
"properties": {
"id": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
},
{
"description": "Description of id"
}
]
},
"provenance": {
"allOf": [
{
"$ref": "#/definitions/Provenance"
},
{
"description": "Description of provenance"
}
]
}
},
"description": "Description of pid"
},
"rank": {
"type": "integer",
"description": "Description of rank"
},
"surname": {
"type": "string",
"description": "Description of surname"
}
},
"description": "Description of author"
}
},
"bestaccessright": {
"allOf": [
{
"$ref": "#/definitions/AccessRight"
},
{
"description": "Description of bestaccessright"
}
]
},
"codeRepositoryUrl": {
"type": "string",
"description": "Description of codeRepositoryUrl"
},
"contactgroup": {
"description": "Description of contactgroup",
"type": "array",
"items": {
"type": "string",
"description": "Description of contactgroup"
}
},
"contactperson": {
"description": "Description of contactperson",
"type": "array",
"items": {
"type": "string",
"description": "Description of contactperson"
}
},
"container": {
"type": "object",
"properties": {
"conferencedate": {
"type": "string",
"description": "Description of conferencedate"
},
"conferenceplace": {
"type": "string",
"description": "Description of conferenceplace"
},
"edition": {
"type": "string",
"description": "Description of edition"
},
"ep": {
"type": "string",
"description": "Description of ep"
},
"iss": {
"type": "string",
"description": "Description of iss"
},
"issnLinking": {
"type": "string",
"description": "Description of issnLinking"
},
"issnOnline": {
"type": "string",
"description": "Description of issnOnline"
},
"issnPrinted": {
"type": "string",
"description": "Description of issnPrinted"
},
"name": {
"type": "string",
"description": "Description of name"
},
"sp": {
"type": "string",
"description": "Description of sp"
},
"vol": {
"type": "string",
"description": "Description of vol"
}
},
"description": "Description of container"
},
"contributor": {
"description": "Description of contributor",
"type": "array",
"items": {
"type": "string",
"description": "Description of contributor"
}
},
"country": {
"description": "Description of country",
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Description of code"
},
"label": {
"type": "string",
"description": "Description of label"
},
"provenance": {
"allOf": [
{
"$ref": "#/definitions/Provenance"
},
{
"description": "Description of provenance"
}
]
}
},
"description": "Description of country"
}
},
"coverage": {
"description": "Description of coverage",
"type": "array",
"items": {
"type": "string",
"description": "Description of coverage"
}
},
"dateofcollection": {
"type": "string",
"description": "Description of dateofcollection"
},
"description": {
"description": "Description of description",
"type": "array",
"items": {
"type": "string",
"description": "Description of description"
}
},
"documentationUrl": {
"description": "Description of documentationUrl",
"type": "array",
"items": {
"type": "string",
"description": "Description of documentationUrl"
}
},
"embargoenddate": {
"type": "string",
"description": "Description of embargoenddate"
},
"externalReference": {
"description": "Description of externalReference",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "Description of name"
},
"provenance": {
"allOf": [
{
"$ref": "#/definitions/Provenance"
},
{
"description": "Description of provenance"
}
]
},
"typology": {
"type": "string",
"description": "Description of typology"
},
"value": {
"type": "string",
"description": "Description of value"
}
},
"description": "Description of externalReference"
}
},
"format": {
"description": "Description of format",
"type": "array",
"items": {
"type": "string",
"description": "Description of format"
}
},
"geolocation": {
"description": "Description of geolocation",
"type": "array",
"items": {
"type": "object",
"properties": {
"box": {
"type": "string",
"description": "Description of box"
},
"place": {
"type": "string",
"description": "Description of place"
},
"point": {
"type": "string",
"description": "Description of point"
}
},
"description": "Description of geolocation"
}
},
"id": {
"type": "string",
"description": "Description of id"
},
"instance": {
"description": "Description of instance",
"type": "array",
"items": {
"type": "object",
"properties": {
"accessright": {
"allOf": [
{
"$ref": "#/definitions/AccessRight"
},
{
"description": "Description of accessright"
}
]
},
"collectedfrom": {
"allOf": [
{
"$ref": "#/definitions/KeyValue"
},
{
"description": "Description of collectedfrom"
}
]
},
"hostedby": {
"allOf": [
{
"$ref": "#/definitions/KeyValue"
},
{
"description": "Description of hostedby"
}
]
},
"license": {
"type": "string",
"description": "Description of license"
},
"publicationdate": {
"type": "string",
"description": "Description of publicationdate"
},
"refereed": {
"type": "string",
"description": "Description of refereed"
},
"type": {
"type": "string",
"description": "Description of type"
},
"url": {
"description": "Description of url",
"type": "array",
"items": {
"type": "string",
"description": "Description of url"
}
}
},
"description": "Description of instance"
}
},
"language": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Description of code"
},
"label": {
"type": "string",
"description": "Description of label"
}
},
"description": "Description of language"
},
"lastupdatetimestamp": {
"type": "integer",
"description": "Description of lastupdatetimestamp"
},
"maintitle": {
"type": "string",
"description": "Description of maintitle"
},
"originalId": {
"description": "Description of originalId",
"type": "array",
"items": {
"type": "string",
"description": "Description of originalId"
}
},
"pid": {
"description": "Description of pid",
"type": "array",
"items": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
},
{
"description": "Description of pid"
}
]
}
},
"programmingLanguage": {
"type": "string",
"description": "Description of programmingLanguage"
},
"publicationdate": {
"type": "string",
"description": "Description of publicationdate"
},
"publisher": {
"type": "string",
"description": "Description of publisher"
},
"size": {
"type": "string",
"description": "Description of size"
},
"source": {
"description": "Description of source",
"type": "array",
"items": {
"type": "string",
"description": "Description of source"
}
},
"subjects": {
"description": "Description of subjects",
"type": "array",
"items": {
"type": "object",
"properties": {
"provenance": {
"allOf": [
{
"$ref": "#/definitions/Provenance"
},
{
"description": "Description of provenance"
}
]
},
"subject": {
"allOf": [
{
"$ref": "#/definitions/ControlledField"
},
{
"description": "Description of subject"
}
]
}
},
"description": "Description of subjects"
}
},
"subtitle": {
"type": "string",
"description": "Description of subtitle"
},
"tool": {
"description": "Description of tool",
"type": "array",
"items": {
"type": "string",
"description": "Description of tool"
}
},
"type": {
"type": "string",
"description": "Description of type"
},
"version": {
"type": "string",
"description": "Description of version"
}
}
}

View File

@ -1,4 +1,5 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true} {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true},
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the work dir path", "paramRequired": true}
] ]

View File

@ -14,30 +14,103 @@
</property> </property>
</parameters> </parameters>
<start to="ExtractDLIEntities"/> <start to="ExtractDLIPublication"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="ExtractDLIEntities"> <action name="ExtractDLIPublication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Extract DLI Entities</name> <name>Extract DLI Entities (Publication)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class> <class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory ${sparkExecutorMemory} --executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT} ${sparkExtraOPT}
</spark-opts> </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg> <arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>publication</arg>
</spark>
<ok to="ExtractDLIDataset"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract DLI Entities (Dataset)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>dataset</arg>
</spark>
<ok to="ExtractDLIUnknown"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIUnknown">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract DLI Entities (Unknown)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>unknown</arg>
</spark>
<ok to="ExtractDLIRelation"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract DLI Entities (Relation)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>relation</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -8,9 +8,11 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
@ -23,12 +25,13 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.dump.oaf.Result; import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult; import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
@Disabled //@Disabled
public class DumpJobTest { public class DumpJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -165,6 +168,10 @@ public class DumpJobTest {
Assertions.assertEquals(90, verificationDataset.count()); Assertions.assertEquals(90, verificationDataset.count());
// verificationDataset
// .filter("id = '50|DansKnawCris::1a960e20087cb46b93588e4e184e8a58'")
// .foreach((ForeachFunction<CommunityResult>) rec -> System.out.println(OBJECT_MAPPER.writeValueAsString(rec)));
Assertions Assertions
.assertTrue( .assertTrue(
verificationDataset.filter("bestAccessright.code = 'c_abf2'").count() == verificationDataset verificationDataset.filter("bestAccessright.code = 'c_abf2'").count() == verificationDataset
@ -213,20 +220,21 @@ public class DumpJobTest {
.run( .run(
// false, sourcePath, workingDir.toString() + "/result", communityMapPath, Dataset.class, // false, sourcePath, workingDir.toString() + "/result", communityMapPath, Dataset.class,
false, sourcePath, workingDir.toString() + "/result", communityMapPath, Dataset.class, false, sourcePath, workingDir.toString() + "/result", communityMapPath, Dataset.class,
Result.class, true); GraphResult.class, true);
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<eu.dnetlib.dhp.schema.dump.oaf.Result> tmp = sc JavaRDD<eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult> tmp = sc
.textFile(workingDir.toString() + "/result") .textFile(workingDir.toString() + "/result")
.map(item -> OBJECT_MAPPER.readValue(item, eu.dnetlib.dhp.schema.dump.oaf.Result.class)); .map(item -> OBJECT_MAPPER.readValue(item, eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult.class));
org.apache.spark.sql.Dataset<eu.dnetlib.dhp.schema.dump.oaf.Result> verificationDataset = spark org.apache.spark.sql.Dataset<eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.Result.class)); .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult.class));
Assertions.assertEquals(5, verificationDataset.count()); Assertions.assertEquals(5, verificationDataset.count());
verificationDataset.show(false); verificationDataset
.foreach((ForeachFunction<GraphResult>) res -> System.out.println(OBJECT_MAPPER.writeValueAsString(res)));
} }
@Test @Test

View File

@ -54,7 +54,7 @@ public class MakeTarTest {
String inputPath = workingDir + "/zenodo/"; String inputPath = workingDir + "/zenodo/";
MakeTar.makeTArArchive(fs, inputPath, "/tmp/out"); MakeTar.makeTArArchive(fs, inputPath, "/tmp/out", 0);
} }
} }

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
@ -7,7 +7,6 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.function.Consumer; import java.util.function.Consumer;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.util.*; import java.util.*;
import java.util.function.Consumer; import java.util.function.Consumer;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -10,6 +10,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterAll;
@ -71,7 +72,7 @@ public class DumpOrganizationProjectDatasourceTest {
public void dumpOrganizationTest() throws Exception { public void dumpOrganizationTest() throws Exception {
final String sourcePath = getClass() final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/organization") .getResource("/eu/dnetlib/dhp/oa/graph/dump/complete/organization")
.getPath(); .getPath();
DumpGraphEntities dg = new DumpGraphEntities(); DumpGraphEntities dg = new DumpGraphEntities();
@ -89,7 +90,10 @@ public class DumpOrganizationProjectDatasourceTest {
Assertions.assertEquals(34, verificationDataset.count()); Assertions.assertEquals(34, verificationDataset.count());
verificationDataset.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); verificationDataset
.foreach(
(ForeachFunction<eu.dnetlib.dhp.schema.dump.oaf.graph.Organization>) o -> System.out
.println(OBJECT_MAPPER.writeValueAsString(o)));
} }
@ -97,7 +101,7 @@ public class DumpOrganizationProjectDatasourceTest {
public void dumpProjectTest() { public void dumpProjectTest() {
final String sourcePath = getClass() final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/project") .getResource("/eu/dnetlib/dhp/oa/graph/dump/complete/project")
.getPath(); .getPath();
DumpGraphEntities dg = new DumpGraphEntities(); DumpGraphEntities dg = new DumpGraphEntities();
@ -115,14 +119,17 @@ public class DumpOrganizationProjectDatasourceTest {
Assertions.assertEquals(12, verificationDataset.count()); Assertions.assertEquals(12, verificationDataset.count());
verificationDataset.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); verificationDataset
.foreach(
(ForeachFunction<eu.dnetlib.dhp.schema.dump.oaf.graph.Project>) o -> System.out
.println(OBJECT_MAPPER.writeValueAsString(o)));
} }
@Test @Test
public void dumpDatasourceTest() { public void dumpDatasourceTest() {
final String sourcePath = getClass() final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/datasource") .getResource("/eu/dnetlib/dhp/oa/graph/dump/complete/datasource")
.getPath(); .getPath();
DumpGraphEntities dg = new DumpGraphEntities(); DumpGraphEntities dg = new DumpGraphEntities();
@ -140,7 +147,10 @@ public class DumpOrganizationProjectDatasourceTest {
Assertions.assertEquals(5, verificationDataset.count()); Assertions.assertEquals(5, verificationDataset.count());
verificationDataset.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); verificationDataset
.foreach(
(ForeachFunction<eu.dnetlib.dhp.schema.dump.oaf.graph.Datasource>) o -> System.out
.println(OBJECT_MAPPER.writeValueAsString(o)));
} }
} }

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;

View File

@ -1,31 +1,23 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.HashMap;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation; import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class ExtractRelationFromEntityTest { public class ExtractRelationFromEntityTest {

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.dump.graph; package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -7,13 +7,10 @@ import java.nio.file.Path;
import java.util.HashMap; import java.util.HashMap;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.neethi.Assertion;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@ -24,9 +21,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation; import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
public class RelationFromOrganizationTest { public class RelationFromOrganizationTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -77,14 +72,19 @@ public class RelationFromOrganizationTest {
public void test1() throws Exception { public void test1() throws Exception {
final String sourcePath = getClass() final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/relation") .getResource("/eu/dnetlib/dhp/oa/graph/dump/relation")
.getPath();
final String communityMapPath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymapservices.json")
.getPath(); .getPath();
SparkOrganizationRelation.main(new String[] { SparkOrganizationRelation.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/relation", "-outputPath", workingDir.toString() + "/relation",
"-sourcePath", sourcePath, "-sourcePath", sourcePath,
"-organizationCommunityMap", organizationCommunityMap "-organizationCommunityMap", organizationCommunityMap,
"-communityMapPath", communityMapPath
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -98,23 +98,24 @@ public class RelationFromOrganizationTest {
verificationDataset.createOrReplaceTempView("table"); verificationDataset.createOrReplaceTempView("table");
Assertions.assertEquals(170, verificationDataset.count()); // Assertions.assertEquals(170, verificationDataset.count());
Assertions.assertEquals(0, verificationDataset.count());
Dataset<Row> checkDs = spark // Dataset<Row> checkDs = spark
.sql( // .sql(
"Select source.id, source.type " + // "Select source.id, source.type " +
"from table "); // "from table ");
//
Assertions.assertEquals(2, checkDs.filter("substr(id, 4, 5) = 'dedup' ").count()); // Assertions.assertEquals(2, checkDs.filter("substr(id, 4, 5) = 'dedup' ").count());
//
Assertions.assertEquals(0, checkDs.filter("id = '20|grid________::afaa39865943381c51f76c08725ffa75'").count()); // Assertions.assertEquals(0, checkDs.filter("id = '20|grid________::afaa39865943381c51f76c08725ffa75'").count());
//
Assertions.assertEquals(25, checkDs.filter("id = '00|context_____::" + DHPUtils.md5("beopen") + "'").count()); // Assertions.assertEquals(25, checkDs.filter("id = '00|context_____::" + DHPUtils.md5("beopen") + "'").count());
//
Assertions // Assertions
.assertEquals(30, checkDs.filter("id = '00|context_____::" + DHPUtils.md5("euromarine") + "'").count()); // .assertEquals(30, checkDs.filter("id = '00|context_____::" + DHPUtils.md5("euromarine") + "'").count());
//
Assertions.assertEquals(30, checkDs.filter("id = '00|context_____::" + DHPUtils.md5("mes") + "'").count()); // Assertions.assertEquals(30, checkDs.filter("id = '00|context_____::" + DHPUtils.md5("mes") + "'").count());
} }
} }

View File

@ -12,6 +12,7 @@ import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
import eu.dnetlib.scholexplorer.relation.RelationMapper; import eu.dnetlib.scholexplorer.relation.RelationMapper;
public class ScholexplorerParserTest { public class ScholexplorerParserTest {
@ -37,4 +38,26 @@ public class ScholexplorerParserTest {
} }
}); });
} }
@Test
public void testPublicationParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml"));
PublicationScholexplorerParser p = new PublicationScholexplorerParser();
List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
ObjectMapper m = new ObjectMapper();
m.enable(SerializationFeature.INDENT_OUTPUT);
oaves
.forEach(
oaf -> {
try {
System.out.println(m.writeValueAsString(oaf));
System.out.println("----------------------------");
} catch (JsonProcessingException e) {
}
});
}
} }

View File

@ -30,7 +30,7 @@ class SparkScholexplorerAggregationTest {
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").getOrCreate() val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication] val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication]

View File

@ -0,0 +1 @@
{"egi":"EGI Federation","covid-19":"COVID-19","rda":"Research Data Alliance","ni":"Neuroinformatics","dh-ch":"Digital Humanities and Cultural Heritage"}

View File

@ -1,51 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/" <oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns="http://namespace.openaire.eu/">
xmlns:dri="http://www.driver-repository.eu/namespace/dri" <oai:header xmlns="">
xmlns:dc="http://purl.org/dc/elements/1.1/"> <dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464::0002882a9d38c4f4612e7666ad768ccd</dri:objIdentifier>
<oai:header> <dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce::url</dri:recordIdentifier>
<dri:repositoryId>aaadf8b3-01a8-4cc2-9964-63cfb19df3b4_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId> <dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2020-11-02T16:14:07.831Z</dri:dateOfCollection>
<dri:recordIdentifier>oai:pangaea.de:doi:10.1594/PANGAEA.432865</dri:recordIdentifier> <dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri">ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
<dri:datasourceprefix>r3d100010134</dri:datasourceprefix> <dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464</dri:datasourceprefix>
<dri:objIdentifier>r3d100010134::00002f60593fd1f758fb838fafb46795</dri:objIdentifier>
<dri:dateOfCollection>2020-02-18T03:05:02.534Z</dri:dateOfCollection>
<oaf:datasourceprefix/>
<identifier>oai:pangaea.de:doi:10.1594/PANGAEA.432865</identifier>
<setSpec>citable topicOceans</setSpec>
</oai:header> </oai:header>
<oai:metadata> <metadata xmlns="">
<resource xmlns="http://datacite.org/schema/kernel-3"> <resource xmlns="http://datacite.org/schema/kernel-3"
<identifier identifierType="doi">10.1594/pangaea.432865</identifier> xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
<titles xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
<title>Daily sea level from coastal tide gauge station Woods_Hole in 1978 (Research quality database)</title> <identifier xmlns="" identifierType="url">https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce</identifier>
<titles xmlns="">
<title>Vertebrate monitoring in the Australian Wet Tropics rainforest at CU6A1 (145.30367623, -16.57767628, 600.0m above MSL) collected by Reptile Surveys</title>
</titles> </titles>
<publisher xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">PANGAEA - Data Publisher for Earth &amp; Environmental Science</publisher> <publisher xmlns="">James Cook University</publisher>
<publicationYear xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">2006</publicationYear> <dates xmlns="">
<dates xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> <date dateType="Collected">2013-05-07</date>
<date dateType="Collected">1978-01-01T12:00:00/1978-12-31T12:00:00</date>
</dates> </dates>
<creators xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> <creators xmlns=""/>
<creator> <resourceType xmlns="" resourceTypeGeneral="Dataset">Dataset</resourceType>
<creatorName>WOCE Sea Level, WSL</creatorName> <relatedIdentifiers xmlns="">
</creator> <relatedIdentifier entityType="publication" inverseRelationType="related"
</creators> relatedIdentifierType="dnet"
<subjects xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> relationType="IsRelatedTo">r3d100010464::57793c5aa995172db237d9da17353f8b</relatedIdentifier>
<subject subjectScheme="Parameter">DATE/TIME</subject>
<subject subjectScheme="Parameter">Sea level</subject>
<subject subjectScheme="Method">Tide gauge station</subject>
<subject subjectScheme="Campaign">SeaLevel</subject>
<subject subjectScheme="Project">World Ocean Circulation Experiment (WOCE)</subject>
</subjects>
<resourceType resourceTypeGeneral="Dataset"/>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="URL" relationType="isDocumentedBy"
inverseRelationType="documents">http://store.pangaea.de/Projects/WOCE/SeaLevel_rqds/Woods_Hole.txt</relatedIdentifier>
</relatedIdentifiers> </relatedIdentifiers>
</resource> </resource>
</oai:metadata> </metadata>
<oaf:about> <oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
<oaf:datainfo> <oaf:datainfo>
<oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010134" name="Pangaea"/> <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464"
name="Australian National Data Service"/>
<oaf:completionStatus>complete</oaf:completionStatus> <oaf:completionStatus>complete</oaf:completionStatus>
<oaf:provisionMode>collected</oaf:provisionMode> <oaf:provisionMode>collected</oaf:provisionMode>
</oaf:datainfo> </oaf:datainfo>

View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns="http://namespace.openaire.eu/">
<oai:header xmlns="">
<dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464::57793c5aa995172db237d9da17353f8b</dri:objIdentifier>
<dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">10.1111/j.1365-2486.2005.00995.x::doi</dri:recordIdentifier>
<dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2020-11-02T16:14:07.831Z</dri:dateOfCollection>
<dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri">ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
<dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464</dri:datasourceprefix>
</oai:header>
<metadata xmlns="">
<oaf:pid xmlns:oaf="http://namespace.dnet.eu/oaf" type="doi">10.1111/j.1365-2486.2005.00995.x</oaf:pid>
<dc:identifier xmlns:dc="http://purl.org/dc/elements/1.1/">10.1111/j.1365-2486.2005.00995.x</dc:identifier>
<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/">Potential decoupling of trends in distribution area and population size of species with climate change.</dc:title>
<dc:type xmlns:dc="http://purl.org/dc/elements/1.1/">publication</dc:type>
</metadata>
<oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
<oaf:datainfo>
<oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464"
name="Australian National Data Service"/>
<oaf:completionStatus>complete</oaf:completionStatus>
<oaf:provisionMode>collected</oaf:provisionMode>
</oaf:datainfo>
</oaf:about>
</oai:record>

Some files were not shown because too many files have changed in this diff Show More