Compare commits
No commits in common. "main" and "fos_l1l2" have entirely different histories.
|
@ -7,12 +7,12 @@ import java.sql.*;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.slf4j.Logger;
|
import org.apache.commons.logging.Log;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
public class DbClient implements Closeable {
|
public class DbClient implements Closeable {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(DbClient.class);
|
private static final Log log = LogFactory.getLog(DbClient.class);
|
||||||
|
|
||||||
private final Connection connection;
|
private final Connection connection;
|
||||||
|
|
||||||
|
@ -37,8 +37,6 @@ public class DbClient implements Closeable {
|
||||||
try (final Statement stmt = connection.createStatement()) {
|
try (final Statement stmt = connection.createStatement()) {
|
||||||
stmt.setFetchSize(100);
|
stmt.setFetchSize(100);
|
||||||
|
|
||||||
log.info("running SQL:\n\n{}\n\n", sql);
|
|
||||||
|
|
||||||
try (final ResultSet rs = stmt.executeQuery(sql)) {
|
try (final ResultSet rs = stmt.executeQuery(sql)) {
|
||||||
while (rs.next()) {
|
while (rs.next()) {
|
||||||
consumer.accept(rs);
|
consumer.accept(rs);
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import okhttp3.MediaType;
|
||||||
|
import okhttp3.RequestBody;
|
||||||
|
import okhttp3.internal.Util;
|
||||||
|
import okio.BufferedSink;
|
||||||
|
import okio.Okio;
|
||||||
|
import okio.Source;
|
||||||
|
|
||||||
|
public class InputStreamRequestBody extends RequestBody {
|
||||||
|
|
||||||
|
private final InputStream inputStream;
|
||||||
|
private final MediaType mediaType;
|
||||||
|
private final long lenght;
|
||||||
|
|
||||||
|
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
|
||||||
|
|
||||||
|
return new InputStreamRequestBody(inputStream, mediaType, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
|
||||||
|
this.inputStream = inputStream;
|
||||||
|
this.mediaType = mediaType;
|
||||||
|
this.lenght = len;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MediaType contentType() {
|
||||||
|
return mediaType;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long contentLength() {
|
||||||
|
|
||||||
|
return lenght;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void writeTo(BufferedSink sink) throws IOException {
|
||||||
|
Source source = null;
|
||||||
|
try {
|
||||||
|
source = Okio.source(inputStream);
|
||||||
|
sink.writeAll(source);
|
||||||
|
} finally {
|
||||||
|
Util.closeQuietly(source);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,8 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api;
|
||||||
|
|
||||||
|
/**
 * Thrown when a requested concept record id cannot be found among the existing depositions.
 *
 * <p>This is a checked exception. It extends {@link Exception} rather than {@link Throwable}
 * directly, so that generic {@code catch (Exception e)} handlers also see it.
 */
public class MissingConceptDoiException extends Exception {

    /**
     * @param message description of the missing concept record id
     */
    public MissingConceptDoiException(String message) {
        super(message);
    }
}
|
|
@ -0,0 +1,363 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHeaders;
|
||||||
|
import org.apache.http.entity.ContentType;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
|
||||||
|
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
|
||||||
|
import okhttp3.*;
|
||||||
|
|
||||||
|
public class ZenodoAPIClient implements Serializable {
|
||||||
|
|
||||||
|
String urlString;
|
||||||
|
String bucket;
|
||||||
|
|
||||||
|
String deposition_id;
|
||||||
|
String access_token;
|
||||||
|
|
||||||
|
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
|
||||||
|
|
||||||
|
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
|
||||||
|
|
||||||
|
public String getUrlString() {
|
||||||
|
return urlString;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUrlString(String urlString) {
|
||||||
|
this.urlString = urlString;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBucket() {
|
||||||
|
return bucket;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBucket(String bucket) {
|
||||||
|
this.bucket = bucket;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDeposition_id(String deposition_id) {
|
||||||
|
this.deposition_id = deposition_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ZenodoAPIClient(String urlString, String access_token) {
|
||||||
|
|
||||||
|
this.urlString = urlString;
|
||||||
|
this.access_token = access_token;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
|
||||||
|
*
|
||||||
|
* @return response code
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public int newDeposition() throws IOException {
|
||||||
|
String json = "{}";
|
||||||
|
|
||||||
|
URL url = new URL(urlString);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setRequestMethod("POST");
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
try (OutputStream os = conn.getOutputStream()) {
|
||||||
|
byte[] input = json.getBytes("utf-8");
|
||||||
|
os.write(input, 0, input.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
String body = getBody(conn);
|
||||||
|
|
||||||
|
int responseCode = conn.getResponseCode();
|
||||||
|
conn.disconnect();
|
||||||
|
|
||||||
|
if (!checkOKStatus(responseCode))
|
||||||
|
throw new IOException("Unexpected code " + responseCode + body);
|
||||||
|
|
||||||
|
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
|
||||||
|
this.bucket = newSubmission.getLinks().getBucket();
|
||||||
|
this.deposition_id = newSubmission.getId();
|
||||||
|
|
||||||
|
return responseCode;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Upload files in Zenodo.
|
||||||
|
*
|
||||||
|
* @param is the inputStream for the file to upload
|
||||||
|
* @param file_name the name of the file as it will appear on Zenodo
|
||||||
|
* @return the response code
|
||||||
|
*/
|
||||||
|
public int uploadIS(InputStream is, String file_name) throws IOException {
|
||||||
|
|
||||||
|
URL url = new URL(bucket + "/" + file_name);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
conn.setRequestMethod("PUT");
|
||||||
|
|
||||||
|
byte[] buf = new byte[8192];
|
||||||
|
int length;
|
||||||
|
try (OutputStream os = conn.getOutputStream()) {
|
||||||
|
while ((length = is.read(buf)) != -1) {
|
||||||
|
os.write(buf, 0, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
int responseCode = conn.getResponseCode();
|
||||||
|
if (!checkOKStatus(responseCode)) {
|
||||||
|
throw new IOException("Unexpected code " + responseCode + getBody(conn));
|
||||||
|
}
|
||||||
|
|
||||||
|
return responseCode;
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
private String getBody(HttpURLConnection conn) throws IOException {
|
||||||
|
String body = "{}";
|
||||||
|
try (BufferedReader br = new BufferedReader(
|
||||||
|
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
|
||||||
|
StringBuilder response = new StringBuilder();
|
||||||
|
String responseLine = null;
|
||||||
|
while ((responseLine = br.readLine()) != null) {
|
||||||
|
response.append(responseLine.trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
body = response.toString();
|
||||||
|
|
||||||
|
}
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Associates metadata information to the current deposition
|
||||||
|
*
|
||||||
|
* @param metadata the metadata
|
||||||
|
* @return response code
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public int sendMretadata(String metadata) throws IOException {
|
||||||
|
|
||||||
|
URL url = new URL(urlString + "/" + deposition_id);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
conn.setRequestMethod("PUT");
|
||||||
|
|
||||||
|
try (OutputStream os = conn.getOutputStream()) {
|
||||||
|
byte[] input = metadata.getBytes("utf-8");
|
||||||
|
os.write(input, 0, input.length);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
final int responseCode = conn.getResponseCode();
|
||||||
|
conn.disconnect();
|
||||||
|
if (!checkOKStatus(responseCode))
|
||||||
|
throw new IOException("Unexpected code " + responseCode + getBody(conn));
|
||||||
|
|
||||||
|
return responseCode;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkOKStatus(int responseCode) {
|
||||||
|
|
||||||
|
if (HttpURLConnection.HTTP_OK != responseCode ||
|
||||||
|
HttpURLConnection.HTTP_CREATED != responseCode)
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To publish the current deposition. It works for both new deposition or new version of an old deposition
|
||||||
|
*
|
||||||
|
* @return response code
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public int publish() throws IOException {
|
||||||
|
|
||||||
|
String json = "{}";
|
||||||
|
|
||||||
|
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
|
||||||
|
|
||||||
|
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
|
||||||
|
|
||||||
|
Request request = new Request.Builder()
|
||||||
|
.url(urlString + "/" + deposition_id + "/actions/publish")
|
||||||
|
.addHeader("Authorization", "Bearer " + access_token)
|
||||||
|
.post(body)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try (Response response = httpClient.newCall(request).execute()) {
|
||||||
|
|
||||||
|
if (!response.isSuccessful())
|
||||||
|
throw new IOException("Unexpected code " + response + response.body().string());
|
||||||
|
|
||||||
|
return response.code();
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
|
||||||
|
* for the new version.
|
||||||
|
*
|
||||||
|
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
|
||||||
|
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
||||||
|
* concept_rec_id = 656930
|
||||||
|
* @return response code
|
||||||
|
*/
|
||||||
|
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
||||||
|
setDepositionId(concept_rec_id, 1);
|
||||||
|
String json = "{}";
|
||||||
|
|
||||||
|
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
conn.setRequestMethod("POST");
|
||||||
|
|
||||||
|
try (OutputStream os = conn.getOutputStream()) {
|
||||||
|
byte[] input = json.getBytes("utf-8");
|
||||||
|
os.write(input, 0, input.length);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
String body = getBody(conn);
|
||||||
|
|
||||||
|
int responseCode = conn.getResponseCode();
|
||||||
|
|
||||||
|
conn.disconnect();
|
||||||
|
if (!checkOKStatus(responseCode))
|
||||||
|
throw new IOException("Unexpected code " + responseCode + body);
|
||||||
|
|
||||||
|
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||||
|
String latest_draft = zenodoModel.getLinks().getLatest_draft();
|
||||||
|
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
|
||||||
|
bucket = getBucket(latest_draft);
|
||||||
|
|
||||||
|
return responseCode;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To finish uploading a version or new deposition not published
|
||||||
|
* It sets the deposition_id and the bucket to be used
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @param deposition_id the deposition id of the not yet published upload
|
||||||
|
* concept_rec_id = 656930
|
||||||
|
* @return response code
|
||||||
|
* @throws IOException
|
||||||
|
* @throws MissingConceptDoiException
|
||||||
|
*/
|
||||||
|
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
|
||||||
|
|
||||||
|
this.deposition_id = deposition_id;
|
||||||
|
|
||||||
|
String json = "{}";
|
||||||
|
|
||||||
|
URL url = new URL(urlString + "/" + deposition_id);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setRequestMethod("POST");
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
try (OutputStream os = conn.getOutputStream()) {
|
||||||
|
byte[] input = json.getBytes("utf-8");
|
||||||
|
os.write(input, 0, input.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
String body = getBody(conn);
|
||||||
|
|
||||||
|
int responseCode = conn.getResponseCode();
|
||||||
|
conn.disconnect();
|
||||||
|
|
||||||
|
if (!checkOKStatus(responseCode))
|
||||||
|
throw new IOException("Unexpected code " + responseCode + body);
|
||||||
|
|
||||||
|
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||||
|
bucket = zenodoModel.getLinks().getBucket();
|
||||||
|
|
||||||
|
return responseCode;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
|
||||||
|
|
||||||
|
ZenodoModelList zenodoModelList = new Gson()
|
||||||
|
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
|
||||||
|
|
||||||
|
for (ZenodoModel zm : zenodoModelList) {
|
||||||
|
if (zm.getConceptrecid().equals(concept_rec_id)) {
|
||||||
|
deposition_id = zm.getId();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (zenodoModelList.size() == 0)
|
||||||
|
throw new MissingConceptDoiException(
|
||||||
|
"The concept record id specified was missing in the list of depositions");
|
||||||
|
setDepositionId(concept_rec_id, page + 1);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getPrevDepositions(String page) throws IOException {
|
||||||
|
|
||||||
|
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
|
||||||
|
urlBuilder.addQueryParameter("page", page);
|
||||||
|
|
||||||
|
URL url = new URL(urlBuilder.build().toString());
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
conn.setRequestMethod("GET");
|
||||||
|
|
||||||
|
String body = getBody(conn);
|
||||||
|
|
||||||
|
int responseCode = conn.getResponseCode();
|
||||||
|
|
||||||
|
conn.disconnect();
|
||||||
|
if (!checkOKStatus(responseCode))
|
||||||
|
throw new IOException("Unexpected code " + responseCode + body);
|
||||||
|
|
||||||
|
return body;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getBucket(String inputUurl) throws IOException {
|
||||||
|
|
||||||
|
URL url = new URL(inputUurl);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||||
|
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||||
|
conn.setDoOutput(true);
|
||||||
|
conn.setRequestMethod("GET");
|
||||||
|
|
||||||
|
String body = getBody(conn);
|
||||||
|
|
||||||
|
int responseCode = conn.getResponseCode();
|
||||||
|
|
||||||
|
conn.disconnect();
|
||||||
|
if (!checkOKStatus(responseCode))
|
||||||
|
throw new IOException("Unexpected code " + responseCode + body);
|
||||||
|
|
||||||
|
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||||
|
|
||||||
|
return zenodoModel.getLinks().getBucket();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
/**
 * A community entry of the deposition metadata, identified by a single string.
 *
 * <p>The class is {@link java.io.Serializable} because it is referenced from a list held by the
 * serializable {@code Metadata} class.
 */
public class Community implements java.io.Serializable {

    /** The community identifier. */
    private String identifier;

    /**
     * @return the community identifier, or {@code null} if not set
     */
    public String getIdentifier() {
        return identifier;
    }

    /**
     * @param identifier the community identifier
     */
    public void setIdentifier(String identifier) {
        this.identifier = identifier;
    }
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
/**
 * A creator entry of the deposition metadata: name, affiliation and ORCID.
 *
 * <p>The class is {@link java.io.Serializable} because it is referenced from a list held by the
 * serializable {@code Metadata} class.
 */
public class Creator implements java.io.Serializable {

    private String affiliation;
    private String name;
    private String orcid;

    /**
     * Builds a creator from its three attributes; any of them may be {@code null}.
     *
     * @param name        the creator name
     * @param affiliation the creator affiliation
     * @param orcid       the creator ORCID
     * @return a new creator carrying the given values
     */
    public static Creator newInstance(String name, String affiliation, String orcid) {
        Creator c = new Creator();
        // Fields start out null, so assigning a null argument leaves them unchanged.
        c.name = name;
        c.affiliation = affiliation;
        c.orcid = orcid;
        return c;
    }

    public String getAffiliation() {
        return affiliation;
    }

    public void setAffiliation(String affiliation) {
        this.affiliation = affiliation;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getOrcid() {
        return orcid;
    }

    public void setOrcid(String orcid) {
        this.orcid = orcid;
    }
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * Description of a file attached to a deposition: checksum, name, size and identifier.
 */
public class File implements Serializable {

    private String checksum;
    private String filename;
    private long filesize;
    private String id;

    // ---- read accessors ----

    public String getChecksum() {
        return this.checksum;
    }

    public String getFilename() {
        return this.filename;
    }

    public long getFilesize() {
        return this.filesize;
    }

    public String getId() {
        return this.id;
    }

    // ---- write accessors ----

    public void setChecksum(String checksum) {
        this.checksum = checksum;
    }

    public void setFilename(String filename) {
        this.filename = filename;
    }

    public void setFilesize(long filesize) {
        this.filesize = filesize;
    }

    public void setId(String id) {
        this.id = id;
    }
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * A grant entry of the deposition metadata, identified by a single id.
 */
public class Grant implements Serializable {

    private String id;

    /**
     * Factory shortcut: creates a grant and assigns its id in one step.
     *
     * @param id the grant id (may be {@code null})
     * @return a new grant carrying the given id
     */
    public static Grant newInstance(String id) {
        final Grant grant = new Grant();
        grant.id = id;
        return grant;
    }

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }
}
|
|
@ -0,0 +1,92 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * The set of link URLs attached to a deposition (bucket, latest draft, publish, self, ...).
 * Every link is kept as a plain string.
 */
public class Links implements Serializable {

    private String bucket;
    private String discard;
    private String edit;
    private String files;
    private String html;
    private String latest_draft;
    private String latest_draft_html;
    private String publish;
    private String self;

    // ---- read accessors ----

    public String getBucket() {
        return this.bucket;
    }

    public String getDiscard() {
        return this.discard;
    }

    public String getEdit() {
        return this.edit;
    }

    public String getFiles() {
        return this.files;
    }

    public String getHtml() {
        return this.html;
    }

    public String getLatest_draft() {
        return this.latest_draft;
    }

    public String getLatest_draft_html() {
        return this.latest_draft_html;
    }

    public String getPublish() {
        return this.publish;
    }

    public String getSelf() {
        return this.self;
    }

    // ---- write accessors ----

    public void setBucket(String bucket) {
        this.bucket = bucket;
    }

    public void setDiscard(String discard) {
        this.discard = discard;
    }

    public void setEdit(String edit) {
        this.edit = edit;
    }

    public void setFiles(String files) {
        this.files = files;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public void setLatest_draft(String latest_draft) {
        this.latest_draft = latest_draft;
    }

    public void setLatest_draft_html(String latest_draft_html) {
        this.latest_draft_html = latest_draft_html;
    }

    public void setPublish(String publish) {
        this.publish = publish;
    }

    public void setSelf(String self) {
        this.self = self;
    }
}
|
|
@ -0,0 +1,153 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class Metadata implements Serializable {
|
||||||
|
|
||||||
|
private String access_right;
|
||||||
|
private List<Community> communities;
|
||||||
|
private List<Creator> creators;
|
||||||
|
private String description;
|
||||||
|
private String doi;
|
||||||
|
private List<Grant> grants;
|
||||||
|
private List<String> keywords;
|
||||||
|
private String language;
|
||||||
|
private String license;
|
||||||
|
private PrereserveDoi prereserve_doi;
|
||||||
|
private String publication_date;
|
||||||
|
private List<String> references;
|
||||||
|
private List<RelatedIdentifier> related_identifiers;
|
||||||
|
private String title;
|
||||||
|
private String upload_type;
|
||||||
|
private String version;
|
||||||
|
|
||||||
|
public String getUpload_type() {
|
||||||
|
return upload_type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUpload_type(String upload_type) {
|
||||||
|
this.upload_type = upload_type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getVersion() {
|
||||||
|
return version;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVersion(String version) {
|
||||||
|
this.version = version;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAccess_right() {
|
||||||
|
return access_right;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAccess_right(String access_right) {
|
||||||
|
this.access_right = access_right;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Community> getCommunities() {
|
||||||
|
return communities;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCommunities(List<Community> communities) {
|
||||||
|
this.communities = communities;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Creator> getCreators() {
|
||||||
|
return creators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCreators(List<Creator> creators) {
|
||||||
|
this.creators = creators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDescription() {
|
||||||
|
return description;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDescription(String description) {
|
||||||
|
this.description = description;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDoi() {
|
||||||
|
return doi;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDoi(String doi) {
|
||||||
|
this.doi = doi;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Grant> getGrants() {
|
||||||
|
return grants;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setGrants(List<Grant> grants) {
|
||||||
|
this.grants = grants;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getKeywords() {
|
||||||
|
return keywords;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setKeywords(List<String> keywords) {
|
||||||
|
this.keywords = keywords;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLanguage() {
|
||||||
|
return language;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLanguage(String language) {
|
||||||
|
this.language = language;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLicense() {
|
||||||
|
return license;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLicense(String license) {
|
||||||
|
this.license = license;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PrereserveDoi getPrereserve_doi() {
|
||||||
|
return prereserve_doi;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
|
||||||
|
this.prereserve_doi = prereserve_doi;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPublication_date() {
|
||||||
|
return publication_date;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPublication_date(String publication_date) {
|
||||||
|
this.publication_date = publication_date;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getReferences() {
|
||||||
|
return references;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReferences(List<String> references) {
|
||||||
|
this.references = references;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<RelatedIdentifier> getRelated_identifiers() {
|
||||||
|
return related_identifiers;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
|
||||||
|
this.related_identifiers = related_identifiers;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTitle() {
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTitle(String title) {
|
||||||
|
this.title = title;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * The pre-reserved DOI of a deposition together with the related record id.
 */
public class PrereserveDoi implements Serializable {

    private String doi;
    private String recid;

    // ---- read accessors ----

    public String getDoi() {
        return this.doi;
    }

    public String getRecid() {
        return this.recid;
    }

    // ---- write accessors ----

    public void setDoi(String doi) {
        this.doi = doi;
    }

    public void setRecid(String recid) {
        this.recid = recid;
    }
}
|
|
@ -0,0 +1,43 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * A related-identifier entry of the deposition metadata: the identifier itself, its scheme,
 * the relation it expresses and the type of the related resource.
 */
public class RelatedIdentifier implements Serializable {

    private String identifier;
    private String relation;
    private String resource_type;
    private String scheme;

    // ---- read accessors ----

    public String getIdentifier() {
        return this.identifier;
    }

    public String getRelation() {
        return this.relation;
    }

    public String getResource_type() {
        return this.resource_type;
    }

    public String getScheme() {
        return this.scheme;
    }

    // ---- write accessors ----

    public void setIdentifier(String identifier) {
        this.identifier = identifier;
    }

    public void setRelation(String relation) {
        this.relation = relation;
    }

    public void setResource_type(String resource_type) {
        this.resource_type = resource_type;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }
}
|
|
@ -0,0 +1,118 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class ZenodoModel implements Serializable {
|
||||||
|
|
||||||
|
private String conceptrecid;
|
||||||
|
private String created;
|
||||||
|
|
||||||
|
private List<File> files;
|
||||||
|
private String id;
|
||||||
|
private Links links;
|
||||||
|
private Metadata metadata;
|
||||||
|
private String modified;
|
||||||
|
private String owner;
|
||||||
|
private String record_id;
|
||||||
|
private String state;
|
||||||
|
private boolean submitted;
|
||||||
|
private String title;
|
||||||
|
|
||||||
|
public String getConceptrecid() {
|
||||||
|
return conceptrecid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setConceptrecid(String conceptrecid) {
|
||||||
|
this.conceptrecid = conceptrecid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getCreated() {
|
||||||
|
return created;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCreated(String created) {
|
||||||
|
this.created = created;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<File> getFiles() {
|
||||||
|
return files;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFiles(List<File> files) {
|
||||||
|
this.files = files;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getId() {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setId(String id) {
|
||||||
|
this.id = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Links getLinks() {
|
||||||
|
return links;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLinks(Links links) {
|
||||||
|
this.links = links;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Metadata getMetadata() {
|
||||||
|
return metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMetadata(Metadata metadata) {
|
||||||
|
this.metadata = metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getModified() {
|
||||||
|
return modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setModified(String modified) {
|
||||||
|
this.modified = modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getOwner() {
|
||||||
|
return owner;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setOwner(String owner) {
|
||||||
|
this.owner = owner;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getRecord_id() {
|
||||||
|
return record_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRecord_id(String record_id) {
|
||||||
|
this.record_id = record_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getState() {
|
||||||
|
return state;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setState(String state) {
|
||||||
|
this.state = state;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isSubmitted() {
|
||||||
|
return submitted;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSubmitted(boolean submitted) {
|
||||||
|
this.submitted = submitted;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTitle() {
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTitle(String title) {
|
||||||
|
this.title = title;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api.zenodo;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
/**
 * A list of {@link ZenodoModel} elements.
 *
 * The class declares no members of its own; it only fixes the element type of {@link ArrayList}.
 * NOTE(review): presumably exists so that a JSON array of depositions can be bound to a concrete,
 * non-generic type - confirm against the callers.
 */
public class ZenodoModelList extends ArrayList<ZenodoModel> {
|
||||||
|
}
|
|
@ -212,11 +212,11 @@ public class HttpConnector2 {
|
||||||
.format(
|
.format(
|
||||||
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||||
MAPPER.writeValueAsString(report)));
|
MAPPER.writeValueAsString(report)));
|
||||||
} catch (MalformedURLException e) {
|
} catch (MalformedURLException | UnknownHostException e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
report.put(e.getClass().getName(), e.getMessage());
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
throw new CollectorException(e.getMessage(), e);
|
throw new CollectorException(e.getMessage(), e);
|
||||||
} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
|
} catch (SocketTimeoutException | SocketException e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
report.put(e.getClass().getName(), e.getMessage());
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||||
|
|
|
@ -1,70 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2024.
|
|
||||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
|
||||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
||||||
*/
|
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.builder.EqualsBuilder;
|
|
||||||
import org.apache.commons.lang3.builder.HashCodeBuilder;
|
|
||||||
|
|
||||||
public class HashableStructuredProperty extends StructuredProperty {
|
|
||||||
|
|
||||||
private static final long serialVersionUID = 8371670185221126045L;
|
|
||||||
|
|
||||||
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
|
|
||||||
if (value == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
final HashableStructuredProperty sp = new HashableStructuredProperty();
|
|
||||||
sp.setValue(value);
|
|
||||||
sp.setQualifier(qualifier);
|
|
||||||
sp.setDataInfo(dataInfo);
|
|
||||||
return sp;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
|
|
||||||
HashableStructuredProperty hsp = new HashableStructuredProperty();
|
|
||||||
hsp.setQualifier(sp.getQualifier());
|
|
||||||
hsp.setValue(sp.getValue());
|
|
||||||
hsp.setQualifier(sp.getQualifier());
|
|
||||||
return hsp;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
|
|
||||||
StructuredProperty sp = new StructuredProperty();
|
|
||||||
sp.setQualifier(hsp.getQualifier());
|
|
||||||
sp.setValue(hsp.getValue());
|
|
||||||
sp.setQualifier(hsp.getQualifier());
|
|
||||||
return sp;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return new HashCodeBuilder(11, 91)
|
|
||||||
.append(getQualifier().getClassid())
|
|
||||||
.append(getQualifier().getSchemeid())
|
|
||||||
.append(getValue())
|
|
||||||
.hashCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object obj) {
|
|
||||||
if (obj == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (obj == this) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (obj.getClass() != getClass()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
|
|
||||||
return new EqualsBuilder()
|
|
||||||
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
|
|
||||||
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
|
|
||||||
.append(getValue(), rhs.getValue())
|
|
||||||
.isEquals();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -43,4 +43,34 @@ public class CleaningFunctions {
|
||||||
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
|
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method that normalises PID values on a per-type basis.
|
||||||
|
* @param pid the PID whose value will be normalised.
|
||||||
|
* @return the PID containing the normalised value.
|
||||||
|
*/
|
||||||
|
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||||
|
pid
|
||||||
|
.setValue(
|
||||||
|
normalizePidValue(
|
||||||
|
pid.getQualifier().getClassid(),
|
||||||
|
pid.getValue()));
|
||||||
|
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalizePidValue(String pidType, String pidValue) {
|
||||||
|
String value = Optional
|
||||||
|
.ofNullable(pidValue)
|
||||||
|
.map(String::trim)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||||
|
|
||||||
|
switch (pidType) {
|
||||||
|
|
||||||
|
// TODO add cleaning for more PID types as needed
|
||||||
|
case "doi":
|
||||||
|
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,11 +6,18 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
public class DoiCleaningRule {
|
public class DoiCleaningRule {
|
||||||
|
|
||||||
public static String clean(final String doi) {
|
public static String clean(final String doi) {
|
||||||
if (doi == null)
|
return doi
|
||||||
return null;
|
.toLowerCase()
|
||||||
final String replaced = doi
|
.replaceAll("\\s", "")
|
||||||
.replaceAll("\\n|\\r|\\t|\\s", "")
|
|
||||||
.replaceAll("^doi:", "")
|
.replaceAll("^doi:", "")
|
||||||
|
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalizeDoi(final String input) {
|
||||||
|
if (input == null)
|
||||||
|
return null;
|
||||||
|
final String replaced = input
|
||||||
|
.replaceAll("\\n|\\r|\\t|\\s", "")
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||||
if (StringUtils.isEmpty(replaced))
|
if (StringUtils.isEmpty(replaced))
|
||||||
|
@ -25,6 +32,7 @@ public class DoiCleaningRule {
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -119,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.getContext()
|
.getContext()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
|
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
|
||||||
.collect(Collectors.toCollection(ArrayList::new)));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
return (T) res;
|
return (T) res;
|
||||||
} else {
|
} else {
|
||||||
|
@ -563,24 +563,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(i.getPid())
|
.ofNullable(i.getPid())
|
||||||
.ifPresent(pid -> {
|
.ifPresent(pid -> {
|
||||||
final Set<HashableStructuredProperty> pids = pid
|
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
|
||||||
.stream()
|
|
||||||
.map(HashableStructuredProperty::newInstance)
|
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(i.getAlternateIdentifier())
|
.ofNullable(i.getAlternateIdentifier())
|
||||||
.ifPresent(altId -> {
|
.ifPresent(altId -> {
|
||||||
final Set<HashableStructuredProperty> altIds = altId
|
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
|
||||||
.stream()
|
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
|
||||||
.map(HashableStructuredProperty::newInstance)
|
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
|
||||||
i
|
|
||||||
.setAlternateIdentifier(
|
|
||||||
Sets
|
|
||||||
.difference(altIds, pids)
|
|
||||||
.stream()
|
|
||||||
.map(HashableStructuredProperty::toStructuredProperty)
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -1015,41 +1003,4 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Implements bad and ugly things that we should get rid of ASAP.
|
|
||||||
*
|
|
||||||
* @param value
|
|
||||||
* @return
|
|
||||||
* @param <T>
|
|
||||||
*/
|
|
||||||
public static <T extends Oaf> T dedicatedUglyHacks(T value) {
|
|
||||||
if (value instanceof OafEntity) {
|
|
||||||
if (value instanceof Result) {
|
|
||||||
final Result r = (Result) value;
|
|
||||||
|
|
||||||
// Fix for AMS Acta
|
|
||||||
Optional
|
|
||||||
.ofNullable(r.getInstance())
|
|
||||||
.map(
|
|
||||||
instance -> instance
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
i -> Optional
|
|
||||||
.ofNullable(i.getHostedby())
|
|
||||||
.map(KeyValue::getKey)
|
|
||||||
.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
|
|
||||||
.orElse(false)))
|
|
||||||
.ifPresent(instance -> instance.forEach(i -> {
|
|
||||||
if (Optional
|
|
||||||
.ofNullable(i.getPid())
|
|
||||||
.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
|
|
||||||
.orElse(false)) {
|
|
||||||
i.setHostedby(UNKNOWN_REPOSITORY);
|
|
||||||
}
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
|
||||||
return entity
|
return entity
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.map(PidCleaner::normalizePidValue)
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
.filter(CleaningFunctions::pidFilter)
|
.filter(CleaningFunctions::pidFilter)
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
|
@ -207,7 +207,7 @@ public class IdentifierFactory implements Serializable {
|
||||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||||
// given PID Type
|
// given PID Type
|
||||||
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
|
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
|
||||||
.map(PidCleaner::normalizePidValue)
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
|
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
|
||||||
.filter(CleaningFunctions::pidFilter))
|
.filter(CleaningFunctions::pidFilter))
|
||||||
.orElse(Stream.empty());
|
.orElse(Stream.empty());
|
||||||
|
|
|
@ -96,7 +96,7 @@ public class MergeEntitiesComparator implements Comparator<Oaf> {
|
||||||
// id
|
// id
|
||||||
if (res == 0) {
|
if (res == 0) {
|
||||||
if (left instanceof OafEntity && right instanceof OafEntity) {
|
if (left instanceof OafEntity && right instanceof OafEntity) {
|
||||||
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
|
res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,6 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class MergeUtils {
|
public class MergeUtils {
|
||||||
|
|
||||||
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
||||||
return mergeGroup(s, oafEntityIterator, true);
|
return mergeGroup(s, oafEntityIterator, true);
|
||||||
}
|
}
|
||||||
|
@ -89,7 +88,7 @@ public class MergeUtils {
|
||||||
private static Oaf mergeEntities(Oaf left, Oaf right, boolean checkDelegatedAuthority) {
|
private static Oaf mergeEntities(Oaf left, Oaf right, boolean checkDelegatedAuthority) {
|
||||||
|
|
||||||
if (sameClass(left, right, Result.class)) {
|
if (sameClass(left, right, Result.class)) {
|
||||||
if (checkDelegatedAuthority) {
|
if (!left.getClass().equals(right.getClass()) || checkDelegatedAuthority) {
|
||||||
return mergeResultsOfDifferentTypes((Result) left, (Result) right);
|
return mergeResultsOfDifferentTypes((Result) left, (Result) right);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -433,10 +432,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
// merge datainfo for same context id
|
// merge datainfo for same context id
|
||||||
merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
|
merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
|
||||||
ArrayList<DataInfo> di = new ArrayList<>();
|
r.getDataInfo().addAll(l.getDataInfo());
|
||||||
di.addAll(r.getDataInfo());
|
|
||||||
di.addAll(l.getDataInfo());
|
|
||||||
r.setDataInfo(di);
|
|
||||||
return r;
|
return r;
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
@ -975,7 +971,7 @@ public class MergeUtils {
|
||||||
private static String extractKeyFromPid(final StructuredProperty pid) {
|
private static String extractKeyFromPid(final StructuredProperty pid) {
|
||||||
if (pid == null)
|
if (pid == null)
|
||||||
return null;
|
return null;
|
||||||
final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid);
|
final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
|
||||||
|
|
||||||
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
|
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
|
||||||
if (right == null)
|
if (right == null)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
StructuredProperty l = PidCleaner.normalizePidValue(left);
|
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
|
||||||
StructuredProperty r = PidCleaner.normalizePidValue(right);
|
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
|
||||||
|
|
||||||
return Optional
|
return Optional
|
||||||
.ofNullable(l.getValue())
|
.ofNullable(l.getValue())
|
||||||
|
|
|
@ -28,7 +28,6 @@ import com.jayway.jsonpath.JsonPath;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
|
||||||
import net.minidev.json.JSONArray;
|
import net.minidev.json.JSONArray;
|
||||||
import scala.collection.JavaConverters;
|
import scala.collection.JavaConverters;
|
||||||
import scala.collection.Seq;
|
import scala.collection.Seq;
|
||||||
|
@ -105,7 +104,7 @@ public class DHPUtils {
|
||||||
|
|
||||||
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
|
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
|
||||||
|
|
||||||
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid);
|
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
|
||||||
|
|
||||||
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
|
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,109 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common.api;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
@Disabled
|
||||||
|
class ZenodoAPIClientTest {
|
||||||
|
|
||||||
|
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
|
||||||
|
private final String ACCESS_TOKEN = "";
|
||||||
|
|
||||||
|
private final String CONCEPT_REC_ID = "657113";
|
||||||
|
|
||||||
|
private final String depositionId = "674915";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
|
||||||
|
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||||
|
ACCESS_TOKEN);
|
||||||
|
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
|
||||||
|
|
||||||
|
File file = new File(getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
|
||||||
|
.getPath());
|
||||||
|
|
||||||
|
InputStream is = new FileInputStream(file);
|
||||||
|
|
||||||
|
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
|
||||||
|
|
||||||
|
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
|
||||||
|
|
||||||
|
Assertions.assertEquals(200, client.sendMretadata(metadata));
|
||||||
|
|
||||||
|
Assertions.assertEquals(202, client.publish());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNewDeposition() throws IOException {
|
||||||
|
|
||||||
|
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||||
|
ACCESS_TOKEN);
|
||||||
|
Assertions.assertEquals(201, client.newDeposition());
|
||||||
|
|
||||||
|
File file = new File(getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
|
||||||
|
.getPath());
|
||||||
|
|
||||||
|
InputStream is = new FileInputStream(file);
|
||||||
|
|
||||||
|
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
|
||||||
|
|
||||||
|
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
|
||||||
|
|
||||||
|
Assertions.assertEquals(200, client.sendMretadata(metadata));
|
||||||
|
|
||||||
|
Assertions.assertEquals(202, client.publish());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
|
||||||
|
|
||||||
|
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||||
|
ACCESS_TOKEN);
|
||||||
|
|
||||||
|
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
|
||||||
|
|
||||||
|
File file = new File(getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
|
||||||
|
.getPath());
|
||||||
|
|
||||||
|
InputStream is = new FileInputStream(file);
|
||||||
|
|
||||||
|
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
|
||||||
|
|
||||||
|
Assertions.assertEquals(202, client.publish());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
|
||||||
|
|
||||||
|
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||||
|
ACCESS_TOKEN);
|
||||||
|
|
||||||
|
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
|
||||||
|
|
||||||
|
File file = new File(getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
|
||||||
|
.getPath());
|
||||||
|
|
||||||
|
InputStream is = new FileInputStream(file);
|
||||||
|
|
||||||
|
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
|
||||||
|
|
||||||
|
Assertions.assertEquals(202, client.publish());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -29,7 +29,7 @@ class IdentifierFactoryTest {
|
||||||
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||||
|
|
||||||
verifyIdentifier(
|
verifyIdentifier(
|
||||||
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
||||||
|
|
||||||
verifyIdentifier(
|
verifyIdentifier(
|
||||||
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
|
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
|
||||||
|
@ -41,7 +41,7 @@ class IdentifierFactoryTest {
|
||||||
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
|
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
|
||||||
|
|
||||||
verifyIdentifier(
|
verifyIdentifier(
|
||||||
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
||||||
|
|
||||||
verifyIdentifier(
|
verifyIdentifier(
|
||||||
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||||
|
|
|
@ -177,7 +177,7 @@ class OafMapperUtilsTest {
|
||||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||||
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
|
ModelConstants.DATASET_RESULTTYPE_CLASSID,
|
||||||
((Result) MergeUtils
|
((Result) MergeUtils
|
||||||
.merge(p2, d1))
|
.merge(p2, d1))
|
||||||
.getResulttype()
|
.getResulttype()
|
||||||
|
|
|
@ -29,7 +29,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"qualifier": {"classid": "pmc"},
|
"qualifier": {"classid": "pmc"},
|
||||||
"value": "PMC21459329"
|
"value": "21459329"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"qualifier":{"classid":"pmc"},
|
"qualifier":{"classid":"pmc"},
|
||||||
"value":"PMC21459329"
|
"value":"21459329"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
// function for the evaluation of the node
|
// function for the evaluation of the node
|
||||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||||
|
|
||||||
TreeNodeStats stats = new TreeNodeStats();
|
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
|
||||||
|
|
||||||
// for each field in the node, it computes the
|
// for each field in the node, it computes the
|
||||||
for (FieldConf fieldConf : fields) {
|
for (FieldConf fieldConf : fields) {
|
||||||
|
|
|
@ -9,8 +9,11 @@ public class TreeNodeStats implements Serializable {
|
||||||
|
|
||||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||||
|
|
||||||
public TreeNodeStats() {
|
private final boolean ignoreUndefined;
|
||||||
|
|
||||||
|
public TreeNodeStats(boolean ignoreUndefined) {
|
||||||
this.results = new HashMap<>();
|
this.results = new HashMap<>();
|
||||||
|
this.ignoreUndefined = ignoreUndefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, FieldStats> getResults() {
|
public Map<String, FieldStats> getResults() {
|
||||||
|
@ -22,7 +25,10 @@ public class TreeNodeStats implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int fieldsCount() {
|
public int fieldsCount() {
|
||||||
|
if (ignoreUndefined)
|
||||||
return this.results.size();
|
return this.results.size();
|
||||||
|
else
|
||||||
|
return this.results.size() - undefinedCount(); // do not count undefined
|
||||||
}
|
}
|
||||||
|
|
||||||
public int undefinedCount() {
|
public int undefinedCount() {
|
||||||
|
@ -78,12 +84,23 @@ public class TreeNodeStats implements Serializable {
|
||||||
double min = 100.0; // random high value
|
double min = 100.0; // random high value
|
||||||
for (FieldStats fs : this.results.values()) {
|
for (FieldStats fs : this.results.values()) {
|
||||||
if (fs.getResult() < min) {
|
if (fs.getResult() < min) {
|
||||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
if (fs.getResult() == -1) {
|
||||||
|
if (fs.isCountIfUndefined()) {
|
||||||
|
min = 0.0;
|
||||||
|
} else {
|
||||||
|
min = -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
min = fs.getResult();
|
min = fs.getResult();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (ignoreUndefined) {
|
||||||
|
return min == -1.0 ? 0.0 : min;
|
||||||
|
} else {
|
||||||
return min;
|
return min;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// if at least one is true, return 1.0
|
// if at least one is true, return 1.0
|
||||||
public double or() {
|
public double or() {
|
||||||
|
@ -91,8 +108,12 @@ public class TreeNodeStats implements Serializable {
|
||||||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||||
return 1.0;
|
return 1.0;
|
||||||
}
|
}
|
||||||
|
if (!ignoreUndefined && undefinedCount() > 0) {
|
||||||
|
return -1.0;
|
||||||
|
} else {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// if at least one is false, return 0.0
|
// if at least one is false, return 0.0
|
||||||
public double and() {
|
public double and() {
|
||||||
|
@ -100,7 +121,7 @@ public class TreeNodeStats implements Serializable {
|
||||||
|
|
||||||
if (fieldStats.getResult() == -1) {
|
if (fieldStats.getResult() == -1) {
|
||||||
if (fieldStats.isCountIfUndefined())
|
if (fieldStats.isCountIfUndefined())
|
||||||
return 0.0;
|
return ignoreUndefined ? 0.0 : -1.0;
|
||||||
} else {
|
} else {
|
||||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
|
@ -44,12 +44,10 @@ public class TreeProcessor {
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
treeStats.addNodeStats(nextNodeName, stats);
|
treeStats.addNodeStats(nextNodeName, stats);
|
||||||
|
|
||||||
// if ignoreUndefined=false the miss is considered as undefined
|
double finalScore = stats.getFinalScore(currentNode.getAggregation());
|
||||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
if (finalScore == -1.0)
|
||||||
nextNodeName = currentNode.getUndefined();
|
nextNodeName = currentNode.getUndefined();
|
||||||
}
|
else if (finalScore >= currentNode.getThreshold()) {
|
||||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
|
||||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
|
||||||
nextNodeName = currentNode.getPositive();
|
nextNodeName = currentNode.getPositive();
|
||||||
} else {
|
} else {
|
||||||
nextNodeName = currentNode.getNegative();
|
nextNodeName = currentNode.getNegative();
|
||||||
|
|
|
@ -135,10 +135,21 @@
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/action_payload_by_type</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/action_payload_by_type</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="PromoteActionPayloadForDatasetTable"/>
|
<ok to="ForkPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<fork name="ForkPromote">
|
||||||
|
<path start="PromoteActionPayloadForDatasetTable"/>
|
||||||
|
<path start="PromoteActionPayloadForDatasourceTable"/>
|
||||||
|
<path start="PromoteActionPayloadForOrganizationTable"/>
|
||||||
|
<path start="PromoteActionPayloadForOtherResearchProductTable"/>
|
||||||
|
<path start="PromoteActionPayloadForProjectTable"/>
|
||||||
|
<path start="PromoteActionPayloadForPublicationTable"/>
|
||||||
|
<path start="PromoteActionPayloadForRelationTable"/>
|
||||||
|
<path start="PromoteActionPayloadForSoftwareTable"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
<action name="PromoteActionPayloadForDatasetTable">
|
<action name="PromoteActionPayloadForDatasetTable">
|
||||||
<sub-workflow>
|
<sub-workflow>
|
||||||
<app-path>${wf:appPath()}/promote_action_payload_for_dataset_table</app-path>
|
<app-path>${wf:appPath()}/promote_action_payload_for_dataset_table</app-path>
|
||||||
|
@ -150,7 +161,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForDatasourceTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -165,7 +176,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForOrganizationTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -180,7 +191,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForOtherResearchProductTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -195,7 +206,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForProjectTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -210,7 +221,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForPublicationTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -225,7 +236,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForRelationTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -240,7 +251,7 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="PromoteActionPayloadForSoftwareTable"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -255,9 +266,11 @@
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</sub-workflow>
|
</sub-workflow>
|
||||||
<ok to="End"/>
|
<ok to="JoinPromote"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<join name="JoinPromote" to="End"/>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -1,210 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2024.
|
|
||||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
|
||||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
||||||
*/
|
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.promote;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.FunctionalInterfaceSupport.*;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
|
|
||||||
import static org.apache.spark.sql.functions.*;
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.DirectoryStream;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.BiFunction;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.sql.*;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.junit.jupiter.api.AfterAll;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
|
|
||||||
public class PromoteResultWithMeasuresTest {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(PromoteResultWithMeasuresTest.class);
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path tempDir;
|
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException {
|
|
||||||
tempDir = Files.createTempDirectory(PromoteResultWithMeasuresTest.class.getSimpleName());
|
|
||||||
log.info("using work dir {}", tempDir);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.setAppName(PromoteResultWithMeasuresTest.class.getSimpleName());
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
|
|
||||||
conf.set("spark.sql.warehouse.dir", tempDir.toString());
|
|
||||||
conf.set("hive.metastore.warehouse.dir", tempDir.resolve("warehouse").toString());
|
|
||||||
|
|
||||||
spark = SparkSession.builder().config(conf).getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
spark.stop();
|
|
||||||
FileUtils.deleteDirectory(tempDir.toFile());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testPromoteResultWithMeasures_job() throws Exception {
|
|
||||||
|
|
||||||
final String inputGraphTablePath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/promote/measures/graph")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
final String inputActionPayloadPath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
final String actionPayloadsPath = tempDir.resolve("actionPayloads").toString();
|
|
||||||
|
|
||||||
spark
|
|
||||||
.read()
|
|
||||||
.text(inputActionPayloadPath)
|
|
||||||
.withColumn("payload", col("value"))
|
|
||||||
.select("payload")
|
|
||||||
.write()
|
|
||||||
.parquet(actionPayloadsPath);
|
|
||||||
|
|
||||||
final Path outputGraphTablePath = tempDir.resolve("outputGraphTablePath");
|
|
||||||
|
|
||||||
PromoteActionPayloadForGraphTableJob
|
|
||||||
.main(new String[] {
|
|
||||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
|
||||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
|
||||||
"--inputGraphTablePath", inputGraphTablePath,
|
|
||||||
"--inputActionPayloadPath", actionPayloadsPath,
|
|
||||||
"--actionPayloadClassName", Result.class.getCanonicalName(),
|
|
||||||
"--outputGraphTablePath", outputGraphTablePath.toString(),
|
|
||||||
"--mergeAndGetStrategy", MergeAndGet.Strategy.MERGE_FROM_AND_GET.toString(),
|
|
||||||
"--promoteActionStrategy", PromoteAction.Strategy.ENRICH.toString(),
|
|
||||||
"--shouldGroupById", "true"
|
|
||||||
});
|
|
||||||
|
|
||||||
assertFalse(isDirEmpty(outputGraphTablePath));
|
|
||||||
|
|
||||||
final Encoder<Publication> pubEncoder = Encoders.bean(Publication.class);
|
|
||||||
List<Publication> results = spark
|
|
||||||
.read()
|
|
||||||
.schema(pubEncoder.schema())
|
|
||||||
.json(outputGraphTablePath.toString())
|
|
||||||
.as(pubEncoder)
|
|
||||||
.collectAsList();
|
|
||||||
|
|
||||||
verify(results);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testPromoteResultWithMeasures_internal() throws JsonProcessingException {
|
|
||||||
|
|
||||||
Dataset<Publication> rowDS = spark
|
|
||||||
.read()
|
|
||||||
.schema(Encoders.bean(Publication.class).schema())
|
|
||||||
.json("src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph")
|
|
||||||
.as(Encoders.bean(Publication.class));
|
|
||||||
|
|
||||||
Dataset<Result> actionPayloadDS = spark
|
|
||||||
.read()
|
|
||||||
.schema(Encoders.bean(Result.class).schema())
|
|
||||||
.json("src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads")
|
|
||||||
.as(Encoders.bean(Result.class));
|
|
||||||
|
|
||||||
final MergeAndGet.Strategy mergeFromAndGet = MergeAndGet.Strategy.MERGE_FROM_AND_GET;
|
|
||||||
|
|
||||||
final SerializableSupplier<Function<Publication, String>> rowIdFn = ModelSupport::idFn;
|
|
||||||
final SerializableSupplier<BiFunction<Publication, Result, Publication>> mergeAndGetFn = MergeAndGet
|
|
||||||
.functionFor(mergeFromAndGet);
|
|
||||||
final SerializableSupplier<Publication> zeroFn = () -> Publication.class
|
|
||||||
.cast(new eu.dnetlib.dhp.schema.oaf.Publication());
|
|
||||||
final SerializableSupplier<Function<Publication, Boolean>> isNotZeroFn = PromoteResultWithMeasuresTest::isNotZeroFnUsingIdOrSourceAndTarget;
|
|
||||||
|
|
||||||
Dataset<Publication> joinedResults = PromoteActionPayloadFunctions
|
|
||||||
.joinGraphTableWithActionPayloadAndMerge(
|
|
||||||
rowDS,
|
|
||||||
actionPayloadDS,
|
|
||||||
rowIdFn,
|
|
||||||
ModelSupport::idFn,
|
|
||||||
mergeAndGetFn,
|
|
||||||
PromoteAction.Strategy.ENRICH,
|
|
||||||
Publication.class,
|
|
||||||
Result.class);
|
|
||||||
|
|
||||||
SerializableSupplier<BiFunction<Publication, Publication, Publication>> mergeRowsAndGetFn = MergeAndGet
|
|
||||||
.functionFor(mergeFromAndGet);
|
|
||||||
|
|
||||||
Dataset<Publication> mergedResults = PromoteActionPayloadFunctions
|
|
||||||
.groupGraphTableByIdAndMerge(
|
|
||||||
joinedResults, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, Publication.class);
|
|
||||||
|
|
||||||
verify(mergedResults.collectAsList());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void verify(List<Publication> results) throws JsonProcessingException {
|
|
||||||
assertNotNull(results);
|
|
||||||
assertEquals(1, results.size());
|
|
||||||
|
|
||||||
Result r = results.get(0);
|
|
||||||
|
|
||||||
log.info(OBJECT_MAPPER.writeValueAsString(r));
|
|
||||||
|
|
||||||
assertNotNull(r.getMeasures());
|
|
||||||
assertFalse(r.getMeasures().isEmpty());
|
|
||||||
assertTrue(
|
|
||||||
r
|
|
||||||
.getMeasures()
|
|
||||||
.stream()
|
|
||||||
.map(Measure::getId)
|
|
||||||
.collect(Collectors.toCollection(HashSet::new))
|
|
||||||
.containsAll(
|
|
||||||
Lists
|
|
||||||
.newArrayList(
|
|
||||||
"downloads", "views", "influence", "popularity", "influence_alt", "popularity_alt",
|
|
||||||
"impulse")));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
|
|
||||||
return t -> {
|
|
||||||
if (isSubClass(t, Relation.class)) {
|
|
||||||
final Relation rel = (Relation) t;
|
|
||||||
return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
|
|
||||||
}
|
|
||||||
return StringUtils.isNotBlank(((OafEntity) t).getId());
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isDirEmpty(final Path directory) throws IOException {
|
|
||||||
try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(directory)) {
|
|
||||||
return !dirStream.iterator().hasNext();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,3 +0,0 @@
|
||||||
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":[{"id":"downloads","unit":[{"key":"opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO","value":"125","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:usage_counts","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"views","unit":[{"key":"opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO","value":"35","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:usage_counts","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":null,"publiclyFunded":null,"transformativeAgreement":null,"isGreen":null,"isInDiamondJournal":null}
|
|
||||||
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"3.1167566E-9","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"7.335433E-9","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"influence_alt","unit":[{"key":"score","value":"4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"2.96","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"impulse","unit":[{"key":"score","value":"4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":null,"publiclyFunded":null,"transformativeAgreement":null,"isGreen":null,"isInDiamondJournal":null}
|
|
||||||
{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":"hybrid","publiclyFunded":false,"transformativeAgreement":null,"isGreen":true,"isInDiamondJournal":false}
|
|
File diff suppressed because one or more lines are too long
|
@ -10,6 +10,7 @@ import java.util.List;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
@ -28,7 +29,6 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
@ -44,10 +44,6 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||||
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||||
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
|
|
||||||
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
|
|
||||||
public static final String DOI_URL_PREFIX = "https://doi.org/";
|
|
||||||
public static final int DOI_URL_PREFIX_LENGTH = 16;
|
|
||||||
|
|
||||||
public static <I extends Result> void main(String[] args) throws Exception {
|
public static <I extends Result> void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@ -78,9 +74,6 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||||
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||||
|
|
||||||
final String publisherInputPath = parser.get("publisherInputPath");
|
|
||||||
log.info("publisherInputPath: {}", publisherInputPath);
|
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
@ -91,72 +84,41 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
Constants.removeOutputDir(spark, outputPath);
|
Constants.removeOutputDir(spark, outputPath);
|
||||||
createActionSet(
|
|
||||||
spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
|
|
||||||
publisherInputPath, outputPath);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
|
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
||||||
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
|
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
||||||
String outputPath) {
|
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
||||||
List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
|
spark, crossrefInputPath, collectedFromCrossref);
|
||||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
|
|
||||||
spark, crossrefInputPath, collectedfromOpenAIRE);
|
|
||||||
|
|
||||||
|
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
||||||
|
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
||||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||||
spark, pubmedInputPath, collectedfromOpenAIRE);
|
spark, pubmedInputPath, collectedFromPubmed);
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
|
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
||||||
spark, openapcInputPath, collectedfromOpenAIRE);
|
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
||||||
|
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
||||||
|
spark, openapcInputPath, collectedFromOpenAPC);
|
||||||
|
|
||||||
|
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
||||||
|
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
||||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||||
spark, dataciteInputPath, collectedfromOpenAIRE);
|
spark, dataciteInputPath, collectedFromDatacite);
|
||||||
|
|
||||||
|
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||||
|
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||||
spark, webcrawlInputPath, collectedfromOpenAIRE);
|
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
|
||||||
spark, publisherlInputPath, collectedfromOpenAIRE);
|
|
||||||
|
|
||||||
crossrefRelations
|
crossrefRelations
|
||||||
.union(pubmedRelations)
|
.union(pubmedRelations)
|
||||||
.union(openAPCRelations)
|
.union(openAPCRelations)
|
||||||
.union(dataciteRelations)
|
.union(dataciteRelations)
|
||||||
.union(webCrawlRelations)
|
.union(webCrawlRelations)
|
||||||
.union(publisherRelations)
|
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||||
}
|
|
||||||
|
|
||||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
|
|
||||||
String inputPath,
|
|
||||||
List<KeyValue> collectedfrom) {
|
|
||||||
|
|
||||||
Dataset<Row> df = spark
|
|
||||||
.read()
|
|
||||||
.schema(
|
|
||||||
"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
|
|
||||||
.json(inputPath)
|
|
||||||
.where("DOI is not null");
|
|
||||||
|
|
||||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
|
||||||
List<KeyValue> collectedfrom) {
|
|
||||||
|
|
||||||
Dataset<Row> df = spark
|
|
||||||
.read()
|
|
||||||
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
|
||||||
.json(inputPath)
|
|
||||||
.where("DOI is not null");
|
|
||||||
|
|
||||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
|
||||||
|
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
||||||
|
@ -167,27 +129,8 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
Dataset<Row> df = spark
|
Dataset<Row> df = spark
|
||||||
.read()
|
.read()
|
||||||
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||||
.json(inputPath)
|
.json(inputPath);
|
||||||
.where("DOI is not null");
|
|
||||||
|
|
||||||
return getTextTextJavaPairRDD(collectedfrom, df);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
|
|
||||||
String inputPath,
|
|
||||||
List<KeyValue> collectedfrom) {
|
|
||||||
// load and parse affiliation relations from HDFS
|
|
||||||
Dataset<Row> df = spark
|
|
||||||
.read()
|
|
||||||
.schema(
|
|
||||||
"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
|
|
||||||
.json(inputPath)
|
|
||||||
.where("DOI is not null");
|
|
||||||
|
|
||||||
return getTextTextJavaPairRDDNew(collectedfrom, df);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
|
||||||
// unroll nested arrays
|
// unroll nested arrays
|
||||||
df = df
|
df = df
|
||||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||||
|
@ -203,7 +146,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
|
|
||||||
// DOI to OpenAIRE id
|
// DOI to OpenAIRE id
|
||||||
final String paperId = ID_PREFIX
|
final String paperId = ID_PREFIX
|
||||||
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
||||||
|
|
||||||
// ROR id to OpenAIRE id
|
// ROR id to OpenAIRE id
|
||||||
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
||||||
|
@ -235,69 +178,6 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
|
||||||
// unroll nested arrays
|
|
||||||
df = df
|
|
||||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
|
||||||
.select(
|
|
||||||
new Column("DOI").as("doi"),
|
|
||||||
new Column("matching.PID").as("pidtype"),
|
|
||||||
new Column("matching.Value").as("pidvalue"),
|
|
||||||
new Column("matching.Confidence").as("confidence"),
|
|
||||||
new Column("matching.Status").as("status"))
|
|
||||||
.where("status = 'active'");
|
|
||||||
|
|
||||||
// prepare action sets for affiliation relations
|
|
||||||
return df
|
|
||||||
.toJavaRDD()
|
|
||||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
|
||||||
|
|
||||||
// DOI to OpenAIRE id
|
|
||||||
final String paperId = ID_PREFIX
|
|
||||||
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
|
|
||||||
|
|
||||||
// Organization to OpenAIRE identifier
|
|
||||||
String affId = null;
|
|
||||||
if (row.getAs("pidtype").equals("ROR"))
|
|
||||||
// ROR id to OpenIARE id
|
|
||||||
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
|
|
||||||
else
|
|
||||||
// getting the OpenOrgs identifier for the organization
|
|
||||||
affId = row.getAs("pidvalue");
|
|
||||||
|
|
||||||
Qualifier qualifier = OafMapperUtils
|
|
||||||
.qualifier(
|
|
||||||
BIP_AFFILIATIONS_CLASSID,
|
|
||||||
BIP_AFFILIATIONS_CLASSNAME,
|
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
|
||||||
|
|
||||||
// format data info; setting `confidence` into relation's `trust`
|
|
||||||
DataInfo dataInfo = OafMapperUtils
|
|
||||||
.dataInfo(
|
|
||||||
false,
|
|
||||||
BIP_INFERENCE_PROVENANCE,
|
|
||||||
true,
|
|
||||||
false,
|
|
||||||
qualifier,
|
|
||||||
Double.toString(row.getAs("confidence")));
|
|
||||||
|
|
||||||
// return bi-directional relations
|
|
||||||
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
|
||||||
|
|
||||||
})
|
|
||||||
.map(p -> new AtomicAction(Relation.class, p))
|
|
||||||
.mapToPair(
|
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String removePrefix(String doi) {
|
|
||||||
if (doi.startsWith(DOI_URL_PREFIX))
|
|
||||||
return doi.substring(DOI_URL_PREFIX_LENGTH);
|
|
||||||
return doi;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
||||||
DataInfo dataInfo) {
|
DataInfo dataInfo) {
|
||||||
return Arrays
|
return Arrays
|
||||||
|
|
|
@ -49,9 +49,6 @@ public class ReadCOCI implements Serializable {
|
||||||
final String workingPath = parser.get("inputPath");
|
final String workingPath = parser.get("inputPath");
|
||||||
log.info("workingPath {}", workingPath);
|
log.info("workingPath {}", workingPath);
|
||||||
|
|
||||||
final String backupPath = parser.get("backupPath");
|
|
||||||
log.info("backupPath {}", backupPath);
|
|
||||||
|
|
||||||
SparkConf sconf = new SparkConf();
|
SparkConf sconf = new SparkConf();
|
||||||
|
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
|
@ -71,14 +68,12 @@ public class ReadCOCI implements Serializable {
|
||||||
workingPath,
|
workingPath,
|
||||||
fileSystem,
|
fileSystem,
|
||||||
outputPath,
|
outputPath,
|
||||||
backupPath,
|
|
||||||
delimiter);
|
delimiter);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
|
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
|
||||||
String outputPath,
|
String outputPath,
|
||||||
String backupPath,
|
|
||||||
String delimiter) throws IOException {
|
String delimiter) throws IOException {
|
||||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||||
.listFiles(
|
.listFiles(
|
||||||
|
@ -112,8 +107,7 @@ public class ReadCOCI implements Serializable {
|
||||||
.mode(SaveMode.Append)
|
.mode(SaveMode.Append)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
|
fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
|
||||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,6 +13,7 @@ import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.*;
|
import org.apache.spark.api.java.function.*;
|
||||||
|
@ -24,6 +25,8 @@ import org.spark_project.jetty.util.StringUtil;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.actionmanager.Constants;
|
||||||
|
import eu.dnetlib.dhp.actionmanager.transformativeagreement.model.TransformativeAgreementModel;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.collection.orcid.model.Author;
|
import eu.dnetlib.dhp.collection.orcid.model.Author;
|
||||||
import eu.dnetlib.dhp.collection.orcid.model.Employment;
|
import eu.dnetlib.dhp.collection.orcid.model.Employment;
|
||||||
|
@ -34,6 +37,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Pid;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
@ -177,20 +181,10 @@ public class ExtractPerson implements Serializable {
|
||||||
.map(
|
.map(
|
||||||
v -> v
|
v -> v
|
||||||
.stream()
|
.stream()
|
||||||
.map(
|
.map(p -> Pid.newInstance(p.getSchema(), p.getValue()))
|
||||||
p -> OafMapperUtils
|
|
||||||
.structuredProperty(
|
|
||||||
p.getValue(), p.getSchema(), p.getSchema(), ModelConstants.DNET_PID_TYPES,
|
|
||||||
ModelConstants.DNET_PID_TYPES, null))
|
|
||||||
.collect(Collectors.toList()))
|
.collect(Collectors.toList()))
|
||||||
.orElse(new ArrayList<>()));
|
.orElse(new ArrayList<>()));
|
||||||
person
|
person.getPid().add(Pid.newInstance(ModelConstants.ORCID, op.getOrcid()));
|
||||||
.getPid()
|
|
||||||
.add(
|
|
||||||
OafMapperUtils
|
|
||||||
.structuredProperty(
|
|
||||||
op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
|
|
||||||
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
|
|
||||||
person.setDateofcollection(op.getLastModifiedDate());
|
person.setDateofcollection(op.getLastModifiedDate());
|
||||||
person.setOriginalId(Arrays.asList(op.getOrcid()));
|
person.setOriginalId(Arrays.asList(op.getOrcid()));
|
||||||
return person;
|
return person;
|
||||||
|
|
|
@ -8,7 +8,7 @@ import java.util.*;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
@ -112,7 +112,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,7 +148,8 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
|
|
||||||
return spark
|
return spark
|
||||||
.read()
|
.read()
|
||||||
.json(inputPath)
|
.option("header", true)
|
||||||
|
.csv(inputPath)
|
||||||
.select("OpenAlexId");
|
.select("OpenAlexId");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,158 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
||||||
import static org.apache.spark.sql.functions.*;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.io.filefilter.DirectoryFileFilter;
|
|
||||||
import org.apache.commons.io.filefilter.FileFileFilter;
|
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.*;
|
|
||||||
import org.apache.spark.sql.types.*;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
public class RemoveRelationFromActionSet
|
|
||||||
implements Serializable {
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CreateActionSetFromWebEntries.class);
|
|
||||||
|
|
||||||
private static final ObjectMapper MAPPER = new ObjectMapper();
|
|
||||||
private static final StructType KV_SCHEMA = StructType$.MODULE$
|
|
||||||
.apply(
|
|
||||||
Arrays
|
|
||||||
.asList(
|
|
||||||
StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
|
|
||||||
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())));
|
|
||||||
|
|
||||||
private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$
|
|
||||||
.apply(
|
|
||||||
Arrays
|
|
||||||
.asList(
|
|
||||||
StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()),
|
|
||||||
StructField$.MODULE$
|
|
||||||
.apply(
|
|
||||||
"payload", DataTypes.StringType, false, Metadata.empty())));
|
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
String jsonConfiguration = IOUtils
|
|
||||||
.toString(
|
|
||||||
CreateActionSetFromWebEntries.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json"));
|
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
||||||
.map(Boolean::valueOf)
|
|
||||||
.orElse(Boolean.TRUE);
|
|
||||||
|
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
||||||
|
|
||||||
// the actionSet path
|
|
||||||
final String inputPath = parser.get("sourcePath");
|
|
||||||
log.info("inputPath: {}", inputPath);
|
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
|
||||||
log.info("outputPath: {}", outputPath);
|
|
||||||
|
|
||||||
final String blackListInputPath = parser.get("blackListPath");
|
|
||||||
log.info("blackListInputPath: {}", blackListInputPath);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
|
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
|
||||||
isSparkSessionManaged,
|
|
||||||
spark -> {
|
|
||||||
|
|
||||||
removeFromActionSet(spark, inputPath, outputPath, blackListInputPath);
|
|
||||||
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void removeFromActionSet(SparkSession spark, String inputPath, String outputPath,
|
|
||||||
String blackListInputPath) {
|
|
||||||
// read the blacklist
|
|
||||||
Dataset<String> blackList = readBlackList(spark, blackListInputPath)
|
|
||||||
.map(
|
|
||||||
(MapFunction<Row, String>) r -> IdentifierFactory
|
|
||||||
.idFromPid("50", "doi", ((String) r.getAs("doi")).substring(16), true),
|
|
||||||
Encoders.STRING());
|
|
||||||
|
|
||||||
// read the old actionset and get the relations in the payload
|
|
||||||
JavaPairRDD<Text, Text> seq = JavaSparkContext
|
|
||||||
.fromSparkContext(spark.sparkContext())
|
|
||||||
.sequenceFile(inputPath, Text.class, Text.class);
|
|
||||||
|
|
||||||
JavaRDD<Row> rdd = seq
|
|
||||||
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
|
|
||||||
|
|
||||||
Dataset<Row> actionSet = spark
|
|
||||||
.createDataFrame(rdd, KV_SCHEMA)
|
|
||||||
.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA))
|
|
||||||
.select(expr("atomic_action.*"));
|
|
||||||
|
|
||||||
Dataset<Relation> relation = actionSet
|
|
||||||
.map(
|
|
||||||
(MapFunction<Row, Relation>) r -> MAPPER.readValue((String) r.getAs("payload"), Relation.class),
|
|
||||||
Encoders.bean(Relation.class));
|
|
||||||
|
|
||||||
// select only the relation not matching any pid in the blacklist as source for the relation
|
|
||||||
Dataset<Relation> relNoSource = relation
|
|
||||||
.joinWith(blackList, relation.col("source").equalTo(blackList.col("value")), "left")
|
|
||||||
.filter((FilterFunction<Tuple2<Relation, String>>) t2 -> t2._2() == null)
|
|
||||||
.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));
|
|
||||||
|
|
||||||
// select only the relation not matching any pid in the blacklist as target of the relation
|
|
||||||
relNoSource
|
|
||||||
.joinWith(blackList, relNoSource.col("target").equalTo(blackList.col("value")), "left")
|
|
||||||
.filter((FilterFunction<Tuple2<Relation, String>>) t2 -> t2._2() == null)
|
|
||||||
.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
|
|
||||||
.toJavaRDD()
|
|
||||||
.map(p -> new AtomicAction(p.getClass(), p))
|
|
||||||
.mapToPair(
|
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
|
||||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
|
||||||
;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
|
|
||||||
|
|
||||||
return spark
|
|
||||||
.read()
|
|
||||||
.json(inputPath)
|
|
||||||
.select("doi");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -22,11 +22,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.gtr2.Gtr2PublicationsCollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
|
@ -60,7 +58,7 @@ public class CollectorWorker extends ReportingJob {
|
||||||
|
|
||||||
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
|
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
|
||||||
|
|
||||||
final String outputPath = this.mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||||
log.info("outputPath path is {}", outputPath);
|
log.info("outputPath path is {}", outputPath);
|
||||||
|
|
||||||
final CollectorPlugin plugin = getCollectorPlugin();
|
final CollectorPlugin plugin = getCollectorPlugin();
|
||||||
|
@ -70,36 +68,36 @@ public class CollectorWorker extends ReportingJob {
|
||||||
|
|
||||||
try (SequenceFile.Writer writer = SequenceFile
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
.createWriter(
|
.createWriter(
|
||||||
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
fileSystem.getConf(),
|
||||||
.keyClass(IntWritable.class),
|
SequenceFile.Writer.file(new Path(outputPath)),
|
||||||
SequenceFile.Writer
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||||
.valueClass(Text.class),
|
SequenceFile.Writer.valueClass(Text.class),
|
||||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||||
final IntWritable key = new IntWritable(counter.get());
|
final IntWritable key = new IntWritable(counter.get());
|
||||||
final Text value = new Text();
|
final Text value = new Text();
|
||||||
plugin
|
plugin
|
||||||
.collect(this.api, this.report)
|
.collect(api, report)
|
||||||
.forEach(content -> {
|
.forEach(
|
||||||
|
content -> {
|
||||||
key.set(counter.getAndIncrement());
|
key.set(counter.getAndIncrement());
|
||||||
value.set(content);
|
value.set(content);
|
||||||
try {
|
try {
|
||||||
writer.append(key, value);
|
writer.append(key, value);
|
||||||
} catch (final Throwable e) {
|
} catch (Throwable e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} catch (final Throwable e) {
|
} catch (Throwable e) {
|
||||||
this.report.put(e.getClass().getName(), e.getMessage());
|
report.put(e.getClass().getName(), e.getMessage());
|
||||||
throw new CollectorException(e);
|
throw new CollectorException(e);
|
||||||
} finally {
|
} finally {
|
||||||
shutdown();
|
shutdown();
|
||||||
this.report.ongoing(counter.longValue(), counter.longValue());
|
report.ongoing(counter.longValue(), counter.longValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void scheduleReport(final AtomicInteger counter) {
|
private void scheduleReport(AtomicInteger counter) {
|
||||||
schedule(new ReporterCallback() {
|
schedule(new ReporterCallback() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Long getCurrent() {
|
public Long getCurrent() {
|
||||||
return counter.longValue();
|
return counter.longValue();
|
||||||
|
@ -114,37 +112,33 @@ public class CollectorWorker extends ReportingJob {
|
||||||
|
|
||||||
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
|
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
|
||||||
|
|
||||||
switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) {
|
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
|
||||||
case oai:
|
case oai:
|
||||||
return new OaiCollectorPlugin(this.clientParams);
|
return new OaiCollectorPlugin(clientParams);
|
||||||
case rest_json2xml:
|
case rest_json2xml:
|
||||||
return new RestCollectorPlugin(this.clientParams);
|
return new RestCollectorPlugin(clientParams);
|
||||||
case file:
|
case file:
|
||||||
return new FileCollectorPlugin(this.fileSystem);
|
return new FileCollectorPlugin(fileSystem);
|
||||||
case fileGzip:
|
case fileGzip:
|
||||||
return new FileGZipCollectorPlugin(this.fileSystem);
|
return new FileGZipCollectorPlugin(fileSystem);
|
||||||
case baseDump:
|
case baseDump:
|
||||||
return new BaseCollectorPlugin(this.fileSystem);
|
return new BaseCollectorPlugin(this.fileSystem);
|
||||||
case gtr2Publications:
|
|
||||||
return new Gtr2PublicationsCollectorPlugin(this.clientParams);
|
|
||||||
case osfPreprints:
|
|
||||||
return new OsfPreprintsCollectorPlugin(this.clientParams);
|
|
||||||
case other:
|
case other:
|
||||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||||
.ofNullable(this.api.getParams().get("other_plugin_type"))
|
.ofNullable(api.getParams().get("other_plugin_type"))
|
||||||
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
|
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
|
||||||
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
|
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
|
||||||
|
|
||||||
switch (plugin) {
|
switch (plugin) {
|
||||||
case mdstore_mongodb_dump:
|
case mdstore_mongodb_dump:
|
||||||
return new MongoDbDumpCollectorPlugin(this.fileSystem);
|
return new MongoDbDumpCollectorPlugin(fileSystem);
|
||||||
case mdstore_mongodb:
|
case mdstore_mongodb:
|
||||||
return new MDStoreCollectorPlugin();
|
return new MDStoreCollectorPlugin();
|
||||||
default:
|
default:
|
||||||
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol());
|
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ public interface CollectorPlugin {
|
||||||
|
|
||||||
enum NAME {
|
enum NAME {
|
||||||
|
|
||||||
oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints;
|
oai, other, rest_json2xml, file, fileGzip, baseDump;
|
||||||
|
|
||||||
public enum OTHER_NAME {
|
public enum OTHER_NAME {
|
||||||
mdstore_mongodb_dump, mdstore_mongodb
|
mdstore_mongodb_dump, mdstore_mongodb
|
||||||
|
|
|
@ -1,43 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.gtr2;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Spliterator;
|
|
||||||
import java.util.Spliterators;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
import java.util.stream.StreamSupport;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|
||||||
|
|
||||||
public class Gtr2PublicationsCollectorPlugin implements CollectorPlugin {
|
|
||||||
|
|
||||||
private final HttpClientParams clientParams;
|
|
||||||
|
|
||||||
public Gtr2PublicationsCollectorPlugin(final HttpClientParams clientParams) {
|
|
||||||
this.clientParams = clientParams;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
|
||||||
|
|
||||||
final String baseUrl = api.getBaseUrl();
|
|
||||||
final String startPage = api.getParams().get("startPage");
|
|
||||||
final String endPage = api.getParams().get("endPage");
|
|
||||||
final String fromDate = api.getParams().get("fromDate");
|
|
||||||
|
|
||||||
if ((fromDate != null) && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
|
||||||
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
|
||||||
}
|
|
||||||
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseUrl, fromDate, startPage, endPage,
|
|
||||||
this.clientParams);
|
|
||||||
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
|
|
||||||
|
|
||||||
return StreamSupport.stream(spliterator, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,215 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.gtr2;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Queue;
|
|
||||||
import java.util.function.Function;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
|
||||||
import org.dom4j.Document;
|
|
||||||
import org.dom4j.DocumentException;
|
|
||||||
import org.dom4j.DocumentHelper;
|
|
||||||
import org.dom4j.Element;
|
|
||||||
import org.joda.time.DateTime;
|
|
||||||
import org.joda.time.format.DateTimeFormat;
|
|
||||||
import org.joda.time.format.DateTimeFormatter;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
|
||||||
|
|
||||||
public class Gtr2PublicationsIterator implements Iterator<String> {
|
|
||||||
|
|
||||||
public static final int PAGE_SIZE = 20;
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class);
|
|
||||||
|
|
||||||
private final HttpConnector2 connector;
|
|
||||||
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
|
||||||
|
|
||||||
private static final int MAX_ATTEMPTS = 10;
|
|
||||||
|
|
||||||
private final String baseUrl;
|
|
||||||
private int currPage;
|
|
||||||
private int endPage;
|
|
||||||
private boolean incremental = false;
|
|
||||||
private DateTime fromDate;
|
|
||||||
|
|
||||||
private final Map<String, String> cache = new HashMap<>();
|
|
||||||
|
|
||||||
private final Queue<String> queue = new LinkedList<>();
|
|
||||||
|
|
||||||
private String nextElement;
|
|
||||||
|
|
||||||
public Gtr2PublicationsIterator(final String baseUrl, final String fromDate, final String startPage,
|
|
||||||
final String endPage,
|
|
||||||
final HttpClientParams clientParams)
|
|
||||||
throws CollectorException {
|
|
||||||
|
|
||||||
this.baseUrl = baseUrl;
|
|
||||||
this.currPage = NumberUtils.toInt(startPage, 1);
|
|
||||||
this.endPage = NumberUtils.toInt(endPage, Integer.MAX_VALUE);
|
|
||||||
this.incremental = StringUtils.isNotBlank(fromDate);
|
|
||||||
this.connector = new HttpConnector2(clientParams);
|
|
||||||
|
|
||||||
if (this.incremental) {
|
|
||||||
this.fromDate = parseDate(fromDate);
|
|
||||||
}
|
|
||||||
|
|
||||||
prepareNextElement();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
return this.nextElement != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String next() {
|
|
||||||
try {
|
|
||||||
return this.nextElement;
|
|
||||||
} finally {
|
|
||||||
prepareNextElement();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void prepareNextElement() {
|
|
||||||
while ((this.currPage <= this.endPage) && this.queue.isEmpty()) {
|
|
||||||
log.debug("FETCHING PAGE + " + this.currPage + "/" + this.endPage);
|
|
||||||
this.queue.addAll(fetchPage(this.currPage++));
|
|
||||||
}
|
|
||||||
this.nextElement = this.queue.poll();
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> fetchPage(final int pageNumber) {
|
|
||||||
|
|
||||||
final List<String> res = new ArrayList<>();
|
|
||||||
try {
|
|
||||||
final Document doc = loadURL(cleanURL(this.baseUrl + "/outcomes/publications?p=" + pageNumber), 0);
|
|
||||||
|
|
||||||
if (this.endPage == Integer.MAX_VALUE) {
|
|
||||||
this.endPage = NumberUtils.toInt(doc.valueOf("/*/@*[local-name() = 'totalPages']"));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (final Object po : doc.selectNodes("//*[local-name() = 'publication']")) {
|
|
||||||
final Element mainEntity = (Element) ((Element) po).detach();
|
|
||||||
|
|
||||||
if (filterIncremental(mainEntity)) {
|
|
||||||
res.add(expandMainEntity(mainEntity));
|
|
||||||
} else {
|
|
||||||
log.debug("Skipped entity");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
log.error("Exception fetching page " + pageNumber, e);
|
|
||||||
throw new RuntimeException("Exception fetching page " + pageNumber, e);
|
|
||||||
}
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addLinkedEntities(final Element master, final String relType, final Element newRoot,
|
|
||||||
final Function<Document, Element> mapper) {
|
|
||||||
|
|
||||||
for (final Object o : master.selectNodes(".//*[local-name()='link']")) {
|
|
||||||
final String rel = ((Element) o).valueOf("@*[local-name()='rel']");
|
|
||||||
final String href = ((Element) o).valueOf("@*[local-name()='href']");
|
|
||||||
|
|
||||||
if (relType.equals(rel) && StringUtils.isNotBlank(href)) {
|
|
||||||
final String cacheKey = relType + "#" + href;
|
|
||||||
if (this.cache.containsKey(cacheKey)) {
|
|
||||||
try {
|
|
||||||
log.debug(" * from cache (" + relType + "): " + href);
|
|
||||||
newRoot.add(DocumentHelper.parseText(this.cache.get(cacheKey)).getRootElement());
|
|
||||||
} catch (final DocumentException e) {
|
|
||||||
log.error("Error retrieving cache element: " + cacheKey, e);
|
|
||||||
throw new RuntimeException("Error retrieving cache element: " + cacheKey, e);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
final Document doc = loadURL(cleanURL(href), 0);
|
|
||||||
final Element elem = mapper.apply(doc);
|
|
||||||
newRoot.add(elem);
|
|
||||||
this.cache.put(cacheKey, elem.asXML());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean filterIncremental(final Element e) {
|
|
||||||
if (!this.incremental || isAfter(e.valueOf("@*[local-name() = 'created']"), this.fromDate)
|
|
||||||
|| isAfter(e.valueOf("@*[local-name() = 'updated']"), this.fromDate)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String expandMainEntity(final Element mainEntity) {
|
|
||||||
final Element newRoot = DocumentHelper.createElement("doc");
|
|
||||||
newRoot.add(mainEntity);
|
|
||||||
addLinkedEntities(mainEntity, "PROJECT", newRoot, this::asProjectElement);
|
|
||||||
return DocumentHelper.createDocument(newRoot).asXML();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Element asProjectElement(final Document doc) {
|
|
||||||
final Element newOrg = DocumentHelper.createElement("project");
|
|
||||||
newOrg.addElement("id").setText(doc.valueOf("/*/@*[local-name()='id']"));
|
|
||||||
newOrg
|
|
||||||
.addElement("code")
|
|
||||||
.setText(doc.valueOf("//*[local-name()='identifier' and @*[local-name()='type'] = 'RCUK']"));
|
|
||||||
newOrg.addElement("title").setText(doc.valueOf("//*[local-name()='title']"));
|
|
||||||
return newOrg;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String cleanURL(final String url) {
|
|
||||||
String cleaned = url;
|
|
||||||
if (cleaned.contains("gtr.gtr")) {
|
|
||||||
cleaned = cleaned.replace("gtr.gtr", "gtr");
|
|
||||||
}
|
|
||||||
if (cleaned.startsWith("http://")) {
|
|
||||||
cleaned = cleaned.replaceFirst("http://", "https://");
|
|
||||||
}
|
|
||||||
return cleaned;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Document loadURL(final String cleanUrl, final int attempt) {
|
|
||||||
try {
|
|
||||||
log.debug(" * Downloading Url: " + cleanUrl);
|
|
||||||
final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes("UTF-8");
|
|
||||||
return DocumentHelper.parseText(new String(bytes));
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e);
|
|
||||||
if (attempt >= MAX_ATTEMPTS) {
|
|
||||||
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
Thread.sleep(60000); // I wait for a minute
|
|
||||||
} catch (final InterruptedException e1) {
|
|
||||||
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
|
|
||||||
}
|
|
||||||
return loadURL(cleanUrl, attempt + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private DateTime parseDate(final String s) {
|
|
||||||
return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isAfter(final String d, final DateTime fromDate) {
|
|
||||||
return StringUtils.isNotBlank(d) && parseDate(d).isAfter(fromDate);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,52 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.osf;
|
|
||||||
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.Spliterator;
|
|
||||||
import java.util.Spliterators;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
import java.util.stream.StreamSupport;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
|
||||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|
||||||
|
|
||||||
public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
|
|
||||||
|
|
||||||
public static final int PAGE_SIZE_VALUE_DEFAULT = 100;
|
|
||||||
|
|
||||||
private final HttpClientParams clientParams;
|
|
||||||
|
|
||||||
public OsfPreprintsCollectorPlugin(final HttpClientParams clientParams) {
|
|
||||||
this.clientParams = clientParams;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
|
||||||
final String baseUrl = api.getBaseUrl();
|
|
||||||
|
|
||||||
final int pageSize = Optional
|
|
||||||
.ofNullable(api.getParams().get("pageSize"))
|
|
||||||
.filter(StringUtils::isNotBlank)
|
|
||||||
.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
|
|
||||||
.orElse(PAGE_SIZE_VALUE_DEFAULT);
|
|
||||||
|
|
||||||
if (StringUtils.isBlank(baseUrl)) {
|
|
||||||
throw new CollectorException("Param 'baseUrl' is null or empty");
|
|
||||||
}
|
|
||||||
|
|
||||||
final OsfPreprintsIterator it = new OsfPreprintsIterator(baseUrl, pageSize, getClientParams());
|
|
||||||
|
|
||||||
return StreamSupport
|
|
||||||
.stream(Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
|
||||||
}
|
|
||||||
|
|
||||||
public HttpClientParams getClientParams() {
|
|
||||||
return this.clientParams;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,151 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.osf;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Queue;
|
|
||||||
import java.util.concurrent.PriorityBlockingQueue;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.dom4j.Document;
|
|
||||||
import org.dom4j.DocumentHelper;
|
|
||||||
import org.dom4j.Element;
|
|
||||||
import org.dom4j.Node;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
|
||||||
|
|
||||||
public class OsfPreprintsIterator implements Iterator<String> {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(OsfPreprintsIterator.class);
|
|
||||||
|
|
||||||
private static final int MAX_ATTEMPTS = 5;
|
|
||||||
|
|
||||||
private final HttpClientParams clientParams;
|
|
||||||
|
|
||||||
private final String baseUrl;
|
|
||||||
private final int pageSize;
|
|
||||||
|
|
||||||
private String currentUrl;
|
|
||||||
|
|
||||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
|
||||||
|
|
||||||
public OsfPreprintsIterator(
|
|
||||||
final String baseUrl,
|
|
||||||
final int pageSize,
|
|
||||||
final HttpClientParams clientParams) {
|
|
||||||
|
|
||||||
this.clientParams = clientParams;
|
|
||||||
this.baseUrl = baseUrl;
|
|
||||||
this.pageSize = pageSize;
|
|
||||||
|
|
||||||
initQueue();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initQueue() {
|
|
||||||
this.currentUrl = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize;
|
|
||||||
|
|
||||||
log.info("REST calls starting with {}", this.currentUrl);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
synchronized (this.recordQueue) {
|
|
||||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
|
|
||||||
&& this.currentUrl.startsWith("http")) {
|
|
||||||
try {
|
|
||||||
this.currentUrl = downloadPage(this.currentUrl);
|
|
||||||
} catch (final CollectorException e) {
|
|
||||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!this.recordQueue.isEmpty()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String next() {
|
|
||||||
synchronized (this.recordQueue) {
|
|
||||||
return this.recordQueue.poll();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String downloadPage(final String url) throws CollectorException {
|
|
||||||
|
|
||||||
final Document doc = downloadUrl(url, 0);
|
|
||||||
|
|
||||||
for (final Object o : doc.selectNodes("/*/data")) {
|
|
||||||
|
|
||||||
final Element n = (Element) ((Element) o).detach();
|
|
||||||
|
|
||||||
final Element group = DocumentHelper.createElement("group");
|
|
||||||
group.addAttribute("id", n.valueOf("./id"));
|
|
||||||
|
|
||||||
group.addElement("preprint").add(n);
|
|
||||||
|
|
||||||
for (final Object o1 : n.selectNodes(".//contributors//href")) {
|
|
||||||
final String href = ((Node) o1).getText();
|
|
||||||
if (StringUtils.isNotBlank(href) && href.startsWith("http")) {
|
|
||||||
final Document doc1 = downloadUrl(href, 0);
|
|
||||||
group.addElement("contributors").add(doc1.getRootElement().detach());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (final Object o1 : n.selectNodes(".//primary_file//href")) {
|
|
||||||
final String href = ((Node) o1).getText();
|
|
||||||
if (StringUtils.isNotBlank(href) && href.startsWith("http")) {
|
|
||||||
final Document doc1 = downloadUrl(href, 0);
|
|
||||||
group.addElement("primary_file").add(doc1.getRootElement().detach());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this.recordQueue.add(DocumentHelper.createDocument(group).asXML());
|
|
||||||
}
|
|
||||||
|
|
||||||
return doc.valueOf("/*/links/next");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
|
||||||
if (attempt > MAX_ATTEMPTS) {
|
|
||||||
throw new CollectorException("Max Number of attempts reached, url:" + url);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (attempt > 0) {
|
|
||||||
final int delay = (attempt * 5000);
|
|
||||||
log.debug("Attempt {} with delay {}", attempt, delay);
|
|
||||||
try {
|
|
||||||
Thread.sleep(delay);
|
|
||||||
} catch (final InterruptedException e) {
|
|
||||||
new CollectorException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
log.info("requesting URL [{}]", url);
|
|
||||||
|
|
||||||
final HttpConnector2 connector = new HttpConnector2(this.clientParams);
|
|
||||||
|
|
||||||
final String json = connector.getInputSource(url);
|
|
||||||
final String xml = JsonUtils.convertToXML(json);
|
|
||||||
|
|
||||||
return DocumentHelper.parseText(xml);
|
|
||||||
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
log.warn(e.getMessage(), e);
|
|
||||||
if ((e instanceof CollectorException) && e.getMessage().contains("401")) {
|
|
||||||
final Element root = DocumentHelper.createElement("error_401_authorization_required");
|
|
||||||
return DocumentHelper.createDocument(root);
|
|
||||||
}
|
|
||||||
return downloadUrl(url, attempt + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -28,19 +28,13 @@
|
||||||
"paramLongName": "dataciteInputPath",
|
"paramLongName": "dataciteInputPath",
|
||||||
"paramDescription": "the path to get the input data from Datacite",
|
"paramDescription": "the path to get the input data from Datacite",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},{
|
||||||
{
|
|
||||||
"paramName": "wip",
|
"paramName": "wip",
|
||||||
"paramLongName": "webCrawlInputPath",
|
"paramLongName": "webCrawlInputPath",
|
||||||
"paramDescription": "the path to get the input data from Web Crawl",
|
"paramDescription": "the path to get the input data from Web Crawl",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
}
|
||||||
{
|
,
|
||||||
"paramName": "pub",
|
|
||||||
"paramLongName": "publisherInputPath",
|
|
||||||
"paramDescription": "the path to get the input data from publishers",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
|
|
|
@ -35,6 +35,5 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json
|
||||||
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
|
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
|
||||||
openapcInputPath=/data/bip-affiliations/openapc-data.json
|
openapcInputPath=/data/bip-affiliations/openapc-data.json
|
||||||
dataciteInputPath=/data/bip-affiliations/datacite-data.json
|
dataciteInputPath=/data/bip-affiliations/datacite-data.json
|
||||||
webCrawlInputPath=/data/bip-affiliations/webCrawl/
|
|
||||||
|
|
||||||
outputPath=/tmp/crossref-affiliations-output-v5
|
outputPath=/tmp/crossref-affiliations-output-v5
|
||||||
|
|
|
@ -21,10 +21,6 @@
|
||||||
<name>webCrawlInputPath</name>
|
<name>webCrawlInputPath</name>
|
||||||
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>publisherInputPath</name>
|
|
||||||
<description>the path where to find the inferred affiliation relations from publisher websites</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>outputPath</name>
|
<name>outputPath</name>
|
||||||
<description>the path where to store the actionset</description>
|
<description>the path where to store the actionset</description>
|
||||||
|
@ -121,7 +117,6 @@
|
||||||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||||
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
||||||
<arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -24,19 +24,12 @@
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
}, {
|
||||||
{
|
|
||||||
"paramName": "nn",
|
"paramName": "nn",
|
||||||
"paramLongName": "hdfsNameNode",
|
"paramLongName": "hdfsNameNode",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
}
|
||||||
{
|
|
||||||
"paramName": "bp",
|
|
||||||
"paramLongName": "backupPath",
|
|
||||||
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
|
||||||
"paramRequired": true
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -129,7 +129,6 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
|
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||||
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
|
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
|
||||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
|
||||||
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
|
|
@ -16,11 +16,10 @@
|
||||||
"paramLongName": "isSparkSessionManged",
|
"paramLongName": "isSparkSessionManged",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},{
|
||||||
{
|
|
||||||
"paramName": "nn",
|
"paramName": "nn",
|
||||||
"paramLongName": "nameNode",
|
"paramLongName": "nameNode",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -24,7 +24,7 @@
|
||||||
|
|
||||||
<decision name="resume_from">
|
<decision name="resume_from">
|
||||||
<switch>
|
<switch>
|
||||||
<case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||||
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
|
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
|
||||||
</switch>
|
</switch>
|
||||||
</decision>
|
</decision>
|
||||||
|
@ -33,14 +33,6 @@
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="reset_workingDir">
|
|
||||||
<fs>
|
|
||||||
<delete path="${workingDir}"/>
|
|
||||||
<mkdir path="${workingDir}"/>
|
|
||||||
</fs>
|
|
||||||
<ok to="download"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<action name="download">
|
<action name="download">
|
||||||
<shell xmlns="uri:oozie:shell-action:0.2">
|
<shell xmlns="uri:oozie:shell-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
|
|
@ -1,11 +1,3 @@
|
||||||
#PROPERTIES TO CREATE THE ACTION SET
|
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
||||||
#sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
outputPath=/tmp/miriam/webcrawlComplete/
|
||||||
#outputPath=/tmp/miriam/webcrawlComplete/
|
blackListPath=/user/miriam.baglioni/openalex-blackList
|
||||||
#blackListPath=/user/miriam.baglioni/openalex-blackList
|
|
||||||
#resumeFrom=create
|
|
||||||
|
|
||||||
#PROPERTIES TO REMOVE FROM THE ACTION SET
|
|
||||||
sourcePath=/var/lib/dnet/actionManager_PROD/webcrawl/rawset_28247629-468b-478e-9a42-bc540877125d_1718121542061/
|
|
||||||
outputPath=/tmp/miriam/webcrawlRemoved/
|
|
||||||
blackListPath=/user/miriam.baglioni/oalexBlackListNormalized
|
|
||||||
resumeFrom=remove
|
|
||||||
|
|
|
@ -20,19 +20,12 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="resumeFrom"/>
|
<start to="create_actionset"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<decision name="resumeFrom">
|
|
||||||
<switch>
|
|
||||||
<case to="create_actionset">${wf:conf('resumeFrom') eq 'create'}</case>
|
|
||||||
<default to="remove_from_actionset"/>
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="create_actionset">
|
<action name="create_actionset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -57,30 +50,5 @@
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="remove_from_actionset">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Removes some relations found to be wrong from the AS</name>
|
|
||||||
<class>eu.dnetlib.dhp.actionmanager.webcrawl.RemoveRelationFromActionSet</class>
|
|
||||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
|
||||||
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -1,51 +1,41 @@
|
||||||
<RESOURCE_PROFILE>
|
<RESOURCE_PROFILE>
|
||||||
<HEADER>
|
<HEADER>
|
||||||
<RESOURCE_IDENTIFIER
|
<RESOURCE_IDENTIFIER value="2ad0cdd9-c96c-484c-8b0e-ed56d86891fe_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU=" />
|
||||||
value="2ad0cdd9-c96c-484c-8b0e-ed56d86891fe_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
|
<RESOURCE_TYPE value="TransformationRuleDSResourceType" />
|
||||||
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
|
<RESOURCE_KIND value="TransformationRuleDSResources" />
|
||||||
<RESOURCE_KIND value="TransformationRuleDSResources"/>
|
<RESOURCE_URI value="" />
|
||||||
<RESOURCE_URI value=""/>
|
<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00" />
|
||||||
<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00"/>
|
|
||||||
</HEADER>
|
</HEADER>
|
||||||
<BODY>
|
<BODY>
|
||||||
<CONFIGURATION>
|
<CONFIGURATION>
|
||||||
<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc"/>
|
<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc" />
|
||||||
<SINK_METADATA_FORMAT name="odf_hbase"/>
|
<SINK_METADATA_FORMAT name="odf_hbase" />
|
||||||
<IMPORTED/>
|
<IMPORTED />
|
||||||
<SCRIPT>
|
<SCRIPT>
|
||||||
<TITLE>xslt_base2odf_hadoop</TITLE>
|
<TITLE>xslt_base2odf_hadoop</TITLE>
|
||||||
<CODE>
|
<CODE>
|
||||||
<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
|
<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO" xmlns:base_dc="http://oai.base-search.net/base_dc/"
|
||||||
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
|
xmlns:datacite="http://datacite.org/schema/kernel-4" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
xmlns:base_dc="http://oai.base-search.net/base_dc/"
|
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:vocabulary="http://eu/dnetlib/transform/clean" xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||||
xmlns:datacite="http://datacite.org/schema/kernel-4"
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
|
||||||
xmlns:vocabulary="http://eu/dnetlib/transform/clean"
|
|
||||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
|
||||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
|
||||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
|
||||||
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
|
||||||
exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
|
exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
|
||||||
<xsl:param name="varOfficialName"/>
|
<xsl:param name="varOfficialName" />
|
||||||
<xsl:param name="varDataSourceId"/>
|
<xsl:param name="varDataSourceId" />
|
||||||
<xsl:param name="varFP7" select="'corda_______::'"/>
|
<xsl:param name="varFP7" select="'corda_______::'" />
|
||||||
<xsl:param name="varH2020" select="'corda__h2020::'"/>
|
<xsl:param name="varH2020" select="'corda__h2020::'" />
|
||||||
<xsl:param name="repoCode"
|
<xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
|
||||||
select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
|
<xsl:param name="index" select="0" />
|
||||||
<xsl:param name="index" select="0"/>
|
<xsl:param name="transDate" select="current-dateTime()" />
|
||||||
<xsl:param name="transDate" select="current-dateTime()"/>
|
|
||||||
|
|
||||||
<xsl:template name="terminate">
|
<xsl:template name="terminate">
|
||||||
<xsl:message terminate="yes"> record is not compliant, transformation is
|
<xsl:message terminate="yes">
|
||||||
interrupted. </xsl:message>
|
record is not compliant, transformation is interrupted.
|
||||||
|
</xsl:message>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
|
||||||
<xsl:template match="/">
|
<xsl:template match="/">
|
||||||
<record>
|
<record>
|
||||||
<xsl:apply-templates select="//*[local-name() = 'header']"/>
|
<xsl:apply-templates select="//*[local-name() = 'header']" />
|
||||||
|
|
||||||
|
|
||||||
<!-- NOT USED
|
<!-- NOT USED
|
||||||
|
@ -66,7 +56,7 @@
|
||||||
|
|
||||||
<xsl:for-each select="//base_dc:doi">
|
<xsl:for-each select="//base_dc:doi">
|
||||||
<datacite:identifier identifierType="DOI">
|
<datacite:identifier identifierType="DOI">
|
||||||
<xsl:value-of select="."/>
|
<xsl:value-of select="." />
|
||||||
</datacite:identifier>
|
</datacite:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
|
@ -74,67 +64,55 @@
|
||||||
<xsl:for-each
|
<xsl:for-each
|
||||||
select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
|
select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
|
||||||
<datacite:identifier alternateIdentifierType="url">
|
<datacite:identifier alternateIdentifierType="url">
|
||||||
<xsl:value-of select="."/>
|
<xsl:value-of select="." />
|
||||||
</datacite:identifier>
|
</datacite:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each
|
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
|
||||||
select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
|
<datacite:identifier alternateIdentifierType="handle">
|
||||||
<datacite:identifier
|
<xsl:value-of select="." />
|
||||||
alternateIdentifierType="handle">
|
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</datacite:identifier>
|
</datacite:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each
|
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
|
||||||
select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
|
<datacite:identifier alternateIdentifierType='urn'>
|
||||||
<datacite:identifier alternateIdentifierType="urn">
|
<xsl:value-of select="." />
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</datacite:identifier>
|
</datacite:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<datacite:identifier
|
<datacite:identifier alternateIdentifierType="oai-original">
|
||||||
alternateIdentifierType="oai-original">
|
<xsl:value-of
|
||||||
<xsl:value-of select="//oai:header/oai:identifier"/>
|
select="//oai:header/oai:identifier" />
|
||||||
</datacite:identifier>
|
</datacite:identifier>
|
||||||
|
|
||||||
</datacite:alternateIdentifiers>
|
</datacite:alternateIdentifiers>
|
||||||
|
|
||||||
<datacite:relatedIdentifiers/>
|
<datacite:relatedIdentifiers />
|
||||||
|
|
||||||
|
|
||||||
<xsl:for-each select="//base_dc:typenorm">
|
<xsl:for-each select="//base_dc:typenorm">
|
||||||
<datacite:resourceType>
|
<datacite:resourceType><xsl:value-of select="vocabulary:clean(., 'base:normalized_types')" /></datacite:resourceType>
|
||||||
<xsl:value-of
|
|
||||||
select="vocabulary:clean(., 'base:normalized_types')"
|
|
||||||
/>
|
|
||||||
</datacite:resourceType>
|
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<datacite:titles>
|
<datacite:titles>
|
||||||
<xsl:for-each select="//dc:title">
|
<xsl:for-each select="//dc:title">
|
||||||
<datacite:title>
|
<datacite:title>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:title>
|
</datacite:title>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</datacite:titles>
|
</datacite:titles>
|
||||||
|
|
||||||
<datacite:creators>
|
<datacite:creators>
|
||||||
<xsl:for-each select="//dc:creator">
|
<xsl:for-each select="//dc:creator">
|
||||||
<xsl:variable name="author"
|
<xsl:variable name="author" select="normalize-space(.)" />
|
||||||
select="normalize-space(.)"/>
|
|
||||||
<datacite:creator>
|
<datacite:creator>
|
||||||
<datacite:creatorName>
|
<datacite:creatorName>
|
||||||
<xsl:value-of select="$author"/>
|
<xsl:value-of select="$author" />
|
||||||
</datacite:creatorName>
|
</datacite:creatorName>
|
||||||
<xsl:for-each
|
<xsl:for-each select="//base_dc:authod_id[normalize-space(./base_dc:creator_name) = $author]/base_dc:creator_id ">
|
||||||
select="//base_dc:authod_id[normalize-space(./base_dc:creator_name) = $author]/base_dc:creator_id ">
|
|
||||||
<xsl:if test="contains(.,'https://orcid.org/')">
|
<xsl:if test="contains(.,'https://orcid.org/')">
|
||||||
<nameIdentifier schemeURI="https://orcid.org/"
|
<nameIdentifier schemeURI="https://orcid.org/" nameIdentifierScheme="ORCID">
|
||||||
nameIdentifierScheme="ORCID">
|
<xsl:value-of select="substring-after(., 'https://orcid.org/')" />
|
||||||
<xsl:value-of
|
|
||||||
select="substring-after(., 'https://orcid.org/')"
|
|
||||||
/>
|
|
||||||
</nameIdentifier>
|
</nameIdentifier>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
@ -146,7 +124,7 @@
|
||||||
<xsl:for-each select="//dc:contributor">
|
<xsl:for-each select="//dc:contributor">
|
||||||
<datacite:contributor>
|
<datacite:contributor>
|
||||||
<datacite:contributorName>
|
<datacite:contributorName>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:contributorName>
|
</datacite:contributorName>
|
||||||
</datacite:contributor>
|
</datacite:contributor>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
@ -155,7 +133,7 @@
|
||||||
<datacite:descriptions>
|
<datacite:descriptions>
|
||||||
<xsl:for-each select="//dc:description">
|
<xsl:for-each select="//dc:description">
|
||||||
<datacite:description descriptionType="Abstract">
|
<datacite:description descriptionType="Abstract">
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:description>
|
</datacite:description>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</datacite:descriptions>
|
</datacite:descriptions>
|
||||||
|
@ -163,47 +141,43 @@
|
||||||
<datacite:subjects>
|
<datacite:subjects>
|
||||||
<xsl:for-each select="//dc:subject">
|
<xsl:for-each select="//dc:subject">
|
||||||
<datacite:subject>
|
<datacite:subject>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:subject>
|
</datacite:subject>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each
|
<xsl:for-each select="//base_dc:classcode|//base_dc:autoclasscode">
|
||||||
select="//base_dc:classcode|//base_dc:autoclasscode">
|
<datacite:subject subjectScheme="{@type}" classificationCode="{normalize-space(.)}">
|
||||||
<datacite:subject subjectScheme="{@type}"
|
|
||||||
classificationCode="{normalize-space(.)}">
|
|
||||||
<!-- TODO the value should be obtained by the Code -->
|
<!-- TODO the value should be obtained by the Code -->
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:subject>
|
</datacite:subject>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</datacite:subjects>
|
</datacite:subjects>
|
||||||
|
|
||||||
<xsl:for-each select="//dc:publisher">
|
<xsl:for-each select="//dc:publisher">
|
||||||
<datacite:publisher>
|
<datacite:publisher>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:publisher>
|
</datacite:publisher>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each select="//base_dc:year">
|
<xsl:for-each select="//base_dc:year">
|
||||||
<datacite:publicationYear>
|
<datacite:publicationYear>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:publicationYear>
|
</datacite:publicationYear>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<datacite:formats>
|
<datacite:formats>
|
||||||
<xsl:for-each select="//dc:format">
|
<xsl:for-each select="//dc:format">
|
||||||
<datacite:format>
|
<datacite:format>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</datacite:format>
|
</datacite:format>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</datacite:formats>
|
</datacite:formats>
|
||||||
|
|
||||||
<datacite:language>
|
<datacite:language>
|
||||||
<xsl:value-of
|
<xsl:value-of select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" />
|
||||||
select="vocabulary:clean( //base_dc:lang, 'dnet:languages')"
|
|
||||||
/>
|
|
||||||
</datacite:language>
|
</datacite:language>
|
||||||
|
|
||||||
<!--<datacite:rightsList>
|
<oaf:accessrights>
|
||||||
<xsl:if test="//base_dc:oa[.='0']">
|
<xsl:if test="//base_dc:oa[.='0']">
|
||||||
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_16ec">restricted access</datacite:rights>
|
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_16ec">restricted access</datacite:rights>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
|
@ -211,29 +185,21 @@
|
||||||
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
|
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
<xsl:for-each select="//dc:rights|//base_dc:rightsnorm">
|
<xsl:for-each select="//dc:rights|//base_dc:rightsnorm">
|
||||||
<datacite:rights>
|
<datacite:rights><xsl:value-of select="vocabulary:clean(., 'dnet:access_modes')" /></datacite:rights>
|
||||||
<xsl:value-of select="vocabulary:clean(., 'dnet:access_modes')"/>
|
|
||||||
</datacite:rights>
|
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</datacite:rightsList>-->
|
</oaf:accessrights>
|
||||||
|
|
||||||
</datacite:resource>
|
</datacite:resource>
|
||||||
|
|
||||||
<xsl:for-each select="//dc:relation">
|
<xsl:for-each select="//dc:relation">
|
||||||
<xsl:if
|
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
|
||||||
test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
|
|
||||||
<oaf:projectid>
|
<oaf:projectid>
|
||||||
<xsl:value-of
|
<xsl:value-of select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
|
||||||
select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))"
|
|
||||||
/>
|
|
||||||
</oaf:projectid>
|
</oaf:projectid>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
<xsl:if
|
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
|
||||||
test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
|
|
||||||
<oaf:projectid>
|
<oaf:projectid>
|
||||||
<xsl:value-of
|
<xsl:value-of select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
|
||||||
select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))"
|
|
||||||
/>
|
|
||||||
</oaf:projectid>
|
</oaf:projectid>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
@ -243,81 +209,68 @@
|
||||||
|
|
||||||
<!-- Book part -->
|
<!-- Book part -->
|
||||||
<xsl:when test="//base_dc:typenorm = '111'">
|
<xsl:when test="//base_dc:typenorm = '111'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0013</dr:CobjCategory>
|
||||||
>0013</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Book -->
|
<!-- Book -->
|
||||||
<xsl:when test="//base_dc:typenorm = '11'">
|
<xsl:when test="//base_dc:typenorm = '11'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0002</dr:CobjCategory>
|
||||||
>0002</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Article contribution -->
|
<!-- Article contribution -->
|
||||||
<xsl:when test="//base_dc:typenorm = '121'">
|
<xsl:when test="//base_dc:typenorm = '121'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||||
>0001</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
|
|
||||||
<!-- Journal/Newspaper -->
|
<!-- Journal/Newspaper -->
|
||||||
<xsl:when test="//base_dc:typenorm = '12'">
|
<xsl:when test="//base_dc:typenorm = '12'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0043</dr:CobjCategory>
|
||||||
>0043</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Report -->
|
<!-- Report -->
|
||||||
<xsl:when test="//base_dc:typenorm = '14'">
|
<xsl:when test="//base_dc:typenorm = '14'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0017</dr:CobjCategory>
|
||||||
>0017</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Review -->
|
<!-- Review -->
|
||||||
<xsl:when test="//base_dc:typenorm = '15'">
|
<xsl:when test="//base_dc:typenorm = '15'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0015</dr:CobjCategory>
|
||||||
>0015</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Lecture -->
|
<!-- Lecture -->
|
||||||
<xsl:when test="//base_dc:typenorm = '17'">
|
<xsl:when test="//base_dc:typenorm = '17'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0010</dr:CobjCategory>
|
||||||
>0010</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Bachelor's thesis -->
|
<!-- Bachelor's thesis -->
|
||||||
<xsl:when test="//base_dc:typenorm = '181'">
|
<xsl:when test="//base_dc:typenorm = '181'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0008</dr:CobjCategory>
|
||||||
>0008</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Master's thesis -->
|
<!-- Master's thesis -->
|
||||||
<xsl:when test="//base_dc:typenorm = '182'">
|
<xsl:when test="//base_dc:typenorm = '182'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0007</dr:CobjCategory>
|
||||||
>0007</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Doctoral and postdoctoral thesis -->
|
<!-- Doctoral and postdoctoral thesis -->
|
||||||
<xsl:when test="//base_dc:typenorm = '183'">
|
<xsl:when test="//base_dc:typenorm = '183'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0006</dr:CobjCategory>
|
||||||
>0006</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Thesis -->
|
<!-- Thesis -->
|
||||||
<xsl:when test="//base_dc:typenorm = '18'">
|
<xsl:when test="//base_dc:typenorm = '18'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0044</dr:CobjCategory>
|
||||||
>0044</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Patent -->
|
<!-- Patent -->
|
||||||
<xsl:when test="//base_dc:typenorm = '1A'">
|
<xsl:when test="//base_dc:typenorm = '1A'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0019</dr:CobjCategory>
|
||||||
>0019</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Text -->
|
<!-- Text -->
|
||||||
<xsl:when test="//base_dc:typenorm = '1'">
|
<xsl:when test="//base_dc:typenorm = '1'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||||
>0001</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Software -->
|
<!-- Software -->
|
||||||
|
@ -362,26 +315,22 @@
|
||||||
|
|
||||||
<!-- Other non-article -->
|
<!-- Other non-article -->
|
||||||
<xsl:when test="//base_dc:typenorm = '122'">
|
<xsl:when test="//base_dc:typenorm = '122'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
|
||||||
>0038</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Course material -->
|
<!-- Course material -->
|
||||||
<xsl:when test="//base_dc:typenorm = '16'">
|
<xsl:when test="//base_dc:typenorm = '16'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
|
||||||
>0038</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Manuscript -->
|
<!-- Manuscript -->
|
||||||
<xsl:when test="//base_dc:typenorm = '19'">
|
<xsl:when test="//base_dc:typenorm = '19'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
|
||||||
>0038</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Conference object -->
|
<!-- Conference object -->
|
||||||
<xsl:when test="//base_dc:typenorm = '13'">
|
<xsl:when test="//base_dc:typenorm = '13'">
|
||||||
<dr:CobjCategory type="publication"
|
<dr:CobjCategory type="publication">0004</dr:CobjCategory>
|
||||||
>0004</dr:CobjCategory>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
|
|
||||||
<!-- Unknown -->
|
<!-- Unknown -->
|
||||||
|
@ -399,100 +348,83 @@
|
||||||
<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
|
<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
|
||||||
<xsl:when test="//base_dc:oa[.='2']">UNKNOWN</xsl:when>
|
<xsl:when test="//base_dc:oa[.='2']">UNKNOWN</xsl:when>
|
||||||
<xsl:when test="//base_dc:rightsnorm">
|
<xsl:when test="//base_dc:rightsnorm">
|
||||||
<xsl:value-of
|
<xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')" />
|
||||||
select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')"
|
|
||||||
/>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
<xsl:when test="//dc:rights">
|
<xsl:when test="//dc:rights">
|
||||||
<xsl:value-of
|
<xsl:value-of select="vocabulary:clean( //dc:rights, 'dnet:access_modes')" />
|
||||||
select="vocabulary:clean( //dc:rights, 'dnet:access_modes')"
|
|
||||||
/>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
<xsl:otherwise>UNKNOWN</xsl:otherwise>
|
<xsl:otherwise>UNKNOWN</xsl:otherwise>
|
||||||
</xsl:choose>
|
</xsl:choose>
|
||||||
</oaf:accessrights>
|
</oaf:accessrights>
|
||||||
|
|
||||||
<xsl:if test="//base_dc:rightsnorm and not(contains(//base_dc:rightsnorm, ';'))">
|
|
||||||
<oaf:license><xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:licenses')" /></oaf:license>
|
|
||||||
</xsl:if>
|
|
||||||
|
|
||||||
<xsl:for-each select="//base_dc:doi">
|
<xsl:for-each select="//base_dc:doi">
|
||||||
<oaf:identifier identifierType="doi">
|
<oaf:identifier identifierType="doi">
|
||||||
<xsl:value-of select="."/>
|
<xsl:value-of select="." />
|
||||||
</oaf:identifier>
|
</oaf:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each
|
<xsl:for-each
|
||||||
select="distinct-values(//dc:identifier[starts-with(., 'http') and ( not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
|
select="distinct-values(//dc:identifier[starts-with(., 'http') and ( not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
|
||||||
<oaf:identifier identifierType="url">
|
<oaf:identifier identifierType="url">
|
||||||
<xsl:value-of select="."/>
|
<xsl:value-of select="." />
|
||||||
</oaf:identifier>
|
</oaf:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each
|
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
|
||||||
select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
|
|
||||||
<oaf:identifier identifierType="handle">
|
<oaf:identifier identifierType="handle">
|
||||||
<xsl:value-of select="."/>
|
<xsl:value-of select="." />
|
||||||
</oaf:identifier>
|
</oaf:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<xsl:for-each
|
<xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
|
||||||
select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
|
<oaf:identifier identifierType='urn'>
|
||||||
<oaf:identifier identifierType="urn">
|
<xsl:value-of select="." />
|
||||||
<xsl:value-of select="."/>
|
|
||||||
</oaf:identifier>
|
</oaf:identifier>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
|
|
||||||
<oaf:identifier identifierType="oai-original">
|
<oaf:identifier identifierType="oai-original">
|
||||||
<xsl:value-of select="//oai:header/oai:identifier"/>
|
<xsl:value-of
|
||||||
|
select="//oai:header/oai:identifier" />
|
||||||
</oaf:identifier>
|
</oaf:identifier>
|
||||||
|
|
||||||
<oaf:hostedBy>
|
<oaf:hostedBy>
|
||||||
<xsl:attribute name="name">
|
<xsl:attribute name="name">
|
||||||
<xsl:value-of select="//base_dc:collname"/>
|
<xsl:value-of select="//base_dc:collname" />
|
||||||
</xsl:attribute>
|
</xsl:attribute>
|
||||||
<xsl:attribute name="id">
|
<xsl:attribute name="id">
|
||||||
<xsl:value-of
|
<xsl:value-of select="concat('opendoar____::', //base_dc:collection/@opendoar_id)" />
|
||||||
select="concat('opendoar____::', //base_dc:collection/@opendoar_id)"
|
|
||||||
/>
|
|
||||||
</xsl:attribute>
|
</xsl:attribute>
|
||||||
</oaf:hostedBy>
|
</oaf:hostedBy>
|
||||||
|
|
||||||
<oaf:collectedFrom>
|
<oaf:collectedFrom>
|
||||||
<xsl:attribute name="name">
|
<xsl:attribute name="name">
|
||||||
<xsl:value-of select="$varOfficialName"/>
|
<xsl:value-of select="$varOfficialName" />
|
||||||
</xsl:attribute>
|
</xsl:attribute>
|
||||||
<xsl:attribute name="id">
|
<xsl:attribute name="id">
|
||||||
<xsl:value-of select="$varDataSourceId"/>
|
<xsl:value-of select="$varDataSourceId" />
|
||||||
</xsl:attribute>
|
</xsl:attribute>
|
||||||
</oaf:collectedFrom>
|
</oaf:collectedFrom>
|
||||||
|
|
||||||
<oaf:dateAccepted>
|
<oaf:dateAccepted>
|
||||||
<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )"/>
|
<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )" />
|
||||||
</oaf:dateAccepted>
|
</oaf:dateAccepted>
|
||||||
|
|
||||||
<xsl:if test="//base_dc:oa[.='1']">
|
<xsl:if test="//base_dc:oa[.='1']">
|
||||||
<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
|
<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
|
||||||
<oaf:fulltext>
|
<oaf:fulltext>
|
||||||
<xsl:value-of select="normalize-space(.)"/>
|
<xsl:value-of select="normalize-space(.)" />
|
||||||
</oaf:fulltext>
|
</oaf:fulltext>
|
||||||
</xsl:for-each>
|
</xsl:for-each>
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
|
|
||||||
<xsl:for-each select="//base_dc:collection/@ror_id">
|
<xsl:for-each select="//base_dc:collection/@ror_id">
|
||||||
<oaf:relation relType="resultOrganization"
|
<oaf:relation relType="resultOrganization" subRelType="affiliation" relClass="hasAuthorInstitution" targetType="organization">
|
||||||
subRelType="affiliation" relClass="hasAuthorInstitution"
|
|
||||||
targetType="organization">
|
|
||||||
<xsl:choose>
|
<xsl:choose>
|
||||||
<xsl:when test="contains(.,'https://ror.org/')">
|
<xsl:when test="contains(.,'https://ror.org/')">
|
||||||
<xsl:value-of
|
<xsl:value-of select="concat('ror_________::', normalize-space(.))" />
|
||||||
select="concat('ror_________::', normalize-space(.))"
|
|
||||||
/>
|
|
||||||
</xsl:when>
|
</xsl:when>
|
||||||
<xsl:otherwise>
|
<xsl:otherwise>
|
||||||
<xsl:value-of
|
<xsl:value-of select="concat('ror_________::https://ror.org/', normalize-space(.))" />
|
||||||
select="concat('ror_________::https://ror.org/', normalize-space(.))"
|
|
||||||
/>
|
|
||||||
</xsl:otherwise>
|
</xsl:otherwise>
|
||||||
</xsl:choose>
|
</xsl:choose>
|
||||||
</oaf:relation>
|
</oaf:relation>
|
||||||
|
@ -503,39 +435,38 @@
|
||||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
<oaf:trust>0.89</oaf:trust>
|
<oaf:trust>0.89</oaf:trust>
|
||||||
<oaf:inferenceprovenance/>
|
<oaf:inferenceprovenance/>
|
||||||
<oaf:provenanceaction
|
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
|
||||||
classid="sysimport:crosswalk:aggregator"
|
|
||||||
classname="sysimport:crosswalk:aggregator"
|
classname="sysimport:crosswalk:aggregator"
|
||||||
schemeid="dnet:provenanceActions"
|
schemeid="dnet:provenanceActions"
|
||||||
schemename="dnet:provenanceActions"/>
|
schemename="dnet:provenanceActions"/>
|
||||||
</oaf:datainfo>
|
</oaf:datainfo>
|
||||||
</metadata>
|
</metadata>
|
||||||
<xsl:copy-of select="//*[local-name() = 'about']"/>
|
<xsl:copy-of select="//*[local-name() = 'about']" />
|
||||||
</record>
|
</record>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
|
||||||
<xsl:template match="//*[local-name() = 'header']">
|
<xsl:template match="//*[local-name() = 'header']">
|
||||||
<xsl:if test="//oai:header/@status='deleted'">
|
<xsl:if test="//oai:header/@status='deleted'">
|
||||||
<xsl:call-template name="terminate"/>
|
<xsl:call-template name="terminate" />
|
||||||
</xsl:if>
|
</xsl:if>
|
||||||
<xsl:copy>
|
<xsl:copy>
|
||||||
<xsl:apply-templates select="node()|@*"/>
|
<xsl:apply-templates select="node()|@*" />
|
||||||
<xsl:element name="dr:dateOfTransformation">
|
<xsl:element name="dr:dateOfTransformation">
|
||||||
<xsl:value-of select="$transDate"/>
|
<xsl:value-of select="$transDate" />
|
||||||
</xsl:element>
|
</xsl:element>
|
||||||
</xsl:copy>
|
</xsl:copy>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
|
||||||
<xsl:template match="node()|@*">
|
<xsl:template match="node()|@*">
|
||||||
<xsl:copy>
|
<xsl:copy>
|
||||||
<xsl:apply-templates select="node()|@*"/>
|
<xsl:apply-templates select="node()|@*" />
|
||||||
</xsl:copy>
|
</xsl:copy>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
</xsl:stylesheet>
|
</xsl:stylesheet>
|
||||||
</CODE>
|
</CODE>
|
||||||
</SCRIPT>
|
</SCRIPT>
|
||||||
</CONFIGURATION>
|
</CONFIGURATION>
|
||||||
<STATUS/>
|
<STATUS />
|
||||||
<SECURITY_PARAMETERS/>
|
<SECURITY_PARAMETERS />
|
||||||
</BODY>
|
</BODY>
|
||||||
</RESOURCE_PROFILE>
|
</RESOURCE_PROFILE>
|
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>sourcePath</name>
|
<name>sourcePath</name>
|
||||||
|
@ -8,40 +8,19 @@
|
||||||
<name>database</name>
|
<name>database</name>
|
||||||
<description>the PDB Database Working Path</description>
|
<description>the PDB Database Working Path</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>mdStoreOutputId</name>
|
<name>targetPath</name>
|
||||||
<description>the identifier of the cleaned MDStore</description>
|
<description>the Target Working dir path</description>
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>mdStoreManagerURI</name>
|
|
||||||
<description>the path of the cleaned mdstore</description>
|
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="StartTransaction"/>
|
<start to="ConvertDB"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="StartTransaction">
|
|
||||||
<java>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
|
||||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
|
||||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
|
||||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
|
||||||
<capture-output/>
|
|
||||||
</java>
|
|
||||||
<ok to="ConvertDB"/>
|
|
||||||
<error to="RollBack"/>
|
|
||||||
</action>
|
|
||||||
<action name="ConvertDB">
|
<action name="ConvertDB">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -62,48 +41,11 @@
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
<arg>--dbPath</arg><arg>${sourcePath}</arg>
|
<arg>--dbPath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--database</arg><arg>${database}</arg>
|
<arg>--database</arg><arg>${database}</arg>
|
||||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="CommitVersion"/>
|
|
||||||
<error to="RollBack"/>
|
|
||||||
|
|
||||||
</action>
|
|
||||||
<action name="CommitVersion">
|
|
||||||
<java>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
|
||||||
<arg>--action</arg><arg>COMMIT</arg>
|
|
||||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
|
||||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="RollBack">
|
|
||||||
<java>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
|
||||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
|
||||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
|
||||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="Kill"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -2,5 +2,5 @@
|
||||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||||
{"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
|
{"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
|
||||||
{"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
|
{"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
|
||||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
|
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -1,20 +1,5 @@
|
||||||
[
|
[
|
||||||
{
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||||
"paramName": "mt",
|
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
|
||||||
"paramLongName": "master",
|
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
|
||||||
"paramDescription": "should be local or yarn",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "s",
|
|
||||||
"paramLongName": "sourcePath",
|
|
||||||
"paramDescription": "the source Path",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "mo",
|
|
||||||
"paramLongName": "mdstoreOutputVersion",
|
|
||||||
"paramDescription": "the oaf path ",
|
|
||||||
"paramRequired": true
|
|
||||||
}
|
|
||||||
]
|
]
|
|
@ -9,26 +9,34 @@
|
||||||
<description>the Working Path</description>
|
<description>the Working Path</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>mdStoreOutputId</name>
|
<name>targetPath</name>
|
||||||
<description>the identifier of the cleaned MDStore</description>
|
<description>the OAF MDStore Path</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>mdStoreManagerURI</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>the path of the cleaned mdstore</description>
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>resumeFrom</name>
|
<name>resumeFrom</name>
|
||||||
<value>CreateEBIDataSet</value>
|
<value>DownloadEBILinks</value>
|
||||||
<description>node to start</description>
|
<description>node to start</description>
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="StartTransaction"/>
|
<start to="resume_from"/>
|
||||||
|
|
||||||
<decision name="resume_from">
|
<decision name="resume_from">
|
||||||
<switch>
|
<switch>
|
||||||
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
|
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
|
||||||
<case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
||||||
<default to="DownloadEBILinks"/>
|
<default to="DownloadEBILinks"/>
|
||||||
</switch>
|
</switch>
|
||||||
</decision>
|
</decision>
|
||||||
|
@ -69,29 +77,9 @@
|
||||||
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
|
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
|
||||||
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
|
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="StartTransaction"/>
|
<ok to="CreateEBIDataSet"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="StartTransaction">
|
|
||||||
<java>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
|
||||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
|
||||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
|
||||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
|
||||||
<capture-output/>
|
|
||||||
</java>
|
|
||||||
<ok to="CreateEBIDataSet"/>
|
|
||||||
<error to="RollBack"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
|
|
||||||
<action name="CreateEBIDataSet">
|
<action name="CreateEBIDataSet">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
|
@ -107,49 +95,11 @@
|
||||||
${sparkExtraOPT}
|
${sparkExtraOPT}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
|
||||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
<action name="CommitVersion">
|
|
||||||
<java>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
|
||||||
<arg>--action</arg><arg>COMMIT</arg>
|
|
||||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
|
||||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="RollBack">
|
|
||||||
<java>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
|
||||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
|
||||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
|
||||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="Kill"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -332,7 +332,7 @@ case object Crossref2Oaf {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
//MAPPING Crossref DOI into PID
|
//MAPPING Crossref DOI into PID
|
||||||
val doi: String = DoiCleaningRule.clean((json \ "DOI").extract[String])
|
val doi: String = DoiCleaningRule.normalizeDoi((json \ "DOI").extract[String])
|
||||||
result.setPid(
|
result.setPid(
|
||||||
List(
|
List(
|
||||||
structuredProperty(
|
structuredProperty(
|
||||||
|
@ -504,24 +504,6 @@ case object Crossref2Oaf {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
val is_review = json \ "relation" \ "is-review-of" \ "id"
|
|
||||||
|
|
||||||
if (is_review != JNothing) {
|
|
||||||
instance.setInstancetype(
|
|
||||||
OafMapperUtils.qualifier(
|
|
||||||
"0015",
|
|
||||||
"peerReviewed",
|
|
||||||
ModelConstants.DNET_REVIEW_LEVELS,
|
|
||||||
ModelConstants.DNET_REVIEW_LEVELS
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
|
|
||||||
instance.setHostedby(
|
|
||||||
OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
|
|
||||||
)
|
|
||||||
|
|
||||||
instance.setAccessright(
|
instance.setAccessright(
|
||||||
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
|
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
|
||||||
)
|
)
|
||||||
|
@ -673,7 +655,7 @@ case object Crossref2Oaf {
|
||||||
val doi = input.getString(0)
|
val doi = input.getString(0)
|
||||||
val rorId = input.getString(1)
|
val rorId = input.getString(1)
|
||||||
|
|
||||||
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
|
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
|
||||||
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
||||||
|
|
||||||
val r: Relation = new Relation
|
val r: Relation = new Relation
|
||||||
|
|
|
@ -231,7 +231,7 @@ object BioDBToOAF {
|
||||||
def uniprotToOAF(input: String): List[Oaf] = {
|
def uniprotToOAF(input: String): List[Oaf] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json = parse(input)
|
lazy val json = parse(input)
|
||||||
val pid = (json \ "pid").extract[String].trim()
|
val pid = (json \ "pid").extract[String]
|
||||||
|
|
||||||
val d = new Dataset
|
val d = new Dataset
|
||||||
|
|
||||||
|
|
|
@ -2,15 +2,12 @@ package eu.dnetlib.dhp.sx.bio
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
|
|
||||||
|
|
||||||
object SparkTransformBioDatabaseToOAF {
|
object SparkTransformBioDatabaseToOAF {
|
||||||
|
|
||||||
|
@ -28,13 +25,8 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
|
|
||||||
val dbPath: String = parser.get("dbPath")
|
val dbPath: String = parser.get("dbPath")
|
||||||
log.info("dbPath: {}", database)
|
log.info("dbPath: {}", database)
|
||||||
|
val targetPath: String = parser.get("targetPath")
|
||||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
log.info("targetPath: {}", database)
|
||||||
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
|
|
||||||
|
|
||||||
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
|
||||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
|
||||||
log.info("outputBasePath: {}", outputBasePath)
|
|
||||||
|
|
||||||
val spark: SparkSession =
|
val spark: SparkSession =
|
||||||
SparkSession
|
SparkSession
|
||||||
|
@ -51,28 +43,24 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
case "UNIPROT" =>
|
case "UNIPROT" =>
|
||||||
CollectionUtils.saveDataset(
|
CollectionUtils.saveDataset(
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
||||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
targetPath
|
||||||
)
|
)
|
||||||
case "PDB" =>
|
case "PDB" =>
|
||||||
CollectionUtils.saveDataset(
|
CollectionUtils.saveDataset(
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
||||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
targetPath
|
||||||
)
|
)
|
||||||
case "SCHOLIX" =>
|
case "SCHOLIX" =>
|
||||||
CollectionUtils.saveDataset(
|
CollectionUtils.saveDataset(
|
||||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
||||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
targetPath
|
||||||
)
|
)
|
||||||
case "CROSSREF_LINKS" =>
|
case "CROSSREF_LINKS" =>
|
||||||
CollectionUtils.saveDataset(
|
CollectionUtils.saveDataset(
|
||||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
||||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
targetPath
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
|
|
||||||
val mdStoreSize = df.count
|
|
||||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,9 +9,6 @@ import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
|
|
||||||
|
|
||||||
object SparkEBILinksToOaf {
|
object SparkEBILinksToOaf {
|
||||||
|
|
||||||
|
@ -35,13 +32,8 @@ object SparkEBILinksToOaf {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
|
||||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
|
||||||
log.info("outputBasePath: {}", outputBasePath)
|
|
||||||
|
|
||||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
|
|
||||||
val ebLinks: Dataset[EBILinkItem] = spark.read
|
val ebLinks: Dataset[EBILinkItem] = spark.read
|
||||||
|
@ -54,10 +46,7 @@ object SparkEBILinksToOaf {
|
||||||
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
targetPath
|
||||||
)
|
)
|
||||||
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
|
|
||||||
val mdStoreSize = df.count
|
|
||||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,7 +30,6 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
|
||||||
|
|
||||||
public class PrepareAffiliationRelationsTest {
|
public class PrepareAffiliationRelationsTest {
|
||||||
|
|
||||||
|
@ -75,20 +74,8 @@ public class PrepareAffiliationRelationsTest {
|
||||||
@Test
|
@Test
|
||||||
void testMatch() throws Exception {
|
void testMatch() throws Exception {
|
||||||
|
|
||||||
String crossrefAffiliationRelationPathNew = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
String crossrefAffiliationRelationPath = getClass()
|
String crossrefAffiliationRelationPath = getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror_old.json")
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
||||||
.getPath();
|
|
||||||
|
|
||||||
String publisherAffiliationRelationPath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
String publisherAffiliationRelationOldPath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publichers_old")
|
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
String outputPath = workingDir.toString() + "/actionSet";
|
String outputPath = workingDir.toString() + "/actionSet";
|
||||||
|
@ -97,12 +84,11 @@ public class PrepareAffiliationRelationsTest {
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
"-crossrefInputPath", crossrefAffiliationRelationPathNew,
|
"-crossrefInputPath", crossrefAffiliationRelationPath,
|
||||||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||||
"-openapcInputPath", crossrefAffiliationRelationPathNew,
|
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||||
"-publisherInputPath", publisherAffiliationRelationOldPath,
|
|
||||||
"-outputPath", outputPath
|
"-outputPath", outputPath
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -113,8 +99,13 @@ public class PrepareAffiliationRelationsTest {
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
.map(aa -> ((Relation) aa.getPayload()));
|
.map(aa -> ((Relation) aa.getPayload()));
|
||||||
|
|
||||||
|
// for (Relation r : tmp.collect()) {
|
||||||
|
// System.out.println(
|
||||||
|
// r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
|
||||||
|
// );
|
||||||
|
// }
|
||||||
// count the number of relations
|
// count the number of relations
|
||||||
assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 =
|
assertEquals(120, tmp.count());
|
||||||
|
|
||||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||||
dataset.createOrReplaceTempView("result");
|
dataset.createOrReplaceTempView("result");
|
||||||
|
@ -125,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// verify that we have equal number of bi-directional relations
|
// verify that we have equal number of bi-directional relations
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
75, execVerification
|
60, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
@ -133,56 +124,26 @@ public class PrepareAffiliationRelationsTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
75, execVerification
|
60, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.size());
|
.size());
|
||||||
|
|
||||||
// check confidence value of a specific relation
|
// check confidence value of a specific relation
|
||||||
String sourceDOI = "10.1089/10872910260066679";
|
String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
|
||||||
|
|
||||||
final String sourceOpenaireId = ID_PREFIX
|
final String sourceOpenaireId = ID_PREFIX
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", sourceDOI));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"1.0", execVerification
|
"0.7071067812", execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"source='" + sourceOpenaireId + "'")
|
"source='" + sourceOpenaireId + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getString(4));
|
.getString(4));
|
||||||
|
|
||||||
final String publisherid = ID_PREFIX
|
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1089/10872910260066679"));
|
|
||||||
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/05cf8a891");
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
1, execVerification
|
|
||||||
.filter(
|
|
||||||
"source = '" + ID_PREFIX
|
|
||||||
+ IdentifierFactory
|
|
||||||
.md5(PidCleaner.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
|
|
||||||
+ "' and target = '" + "20|ror_________::"
|
|
||||||
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
|
|
||||||
.count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
3, execVerification
|
|
||||||
.filter(
|
|
||||||
"source = '" + ID_PREFIX
|
|
||||||
+ IdentifierFactory
|
|
||||||
.md5(PidCleaner.normalizePidValue("doi", "10.1007/3-540-47984-8_14"))
|
|
||||||
+ "' and target = '" + "20|ror_________::"
|
|
||||||
+ IdentifierFactory.md5("https://ror.org/00a0n9e72") + "'")
|
|
||||||
.count());
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,6 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
|
||||||
|
|
||||||
public class CreateOpenCitationsASTest {
|
public class CreateOpenCitationsASTest {
|
||||||
|
|
||||||
|
@ -281,17 +280,17 @@ public class CreateOpenCitationsASTest {
|
||||||
@Test
|
@Test
|
||||||
void testRelationsSourceTargetCouple() throws Exception {
|
void testRelationsSourceTargetCouple() throws Exception {
|
||||||
final String doi1 = "50|doi_________::"
|
final String doi1 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||||
final String doi2 = "50|doi_________::"
|
final String doi2 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||||
final String doi3 = "50|doi_________::"
|
final String doi3 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||||
final String doi4 = "50|doi_________::"
|
final String doi4 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||||
final String doi5 = "50|doi_________::"
|
final String doi5 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||||
final String doi6 = "50|doi_________::"
|
final String doi6 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||||
|
|
||||||
String inputPath = getClass()
|
String inputPath = getClass()
|
||||||
.getResource(
|
.getResource(
|
||||||
|
|
|
@ -77,13 +77,13 @@ public class RemapTest {
|
||||||
MapOCIdsInPids
|
MapOCIdsInPids
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
"--isSparkSessionManged",
|
"-isSparkSessionManged",
|
||||||
Boolean.FALSE.toString(),
|
Boolean.FALSE.toString(),
|
||||||
"--inputPath",
|
"-inputPath",
|
||||||
inputPath,
|
inputPath,
|
||||||
"--outputPath",
|
"-outputPath",
|
||||||
workingDir.toString() + "/out/",
|
workingDir.toString() + "/out/",
|
||||||
"--nameNode", "hdfs://localhost"
|
"-nameNode", "input1;input2;input3;input4;input5"
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,15 +1,22 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.person;
|
package eu.dnetlib.dhp.actionmanager.person;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.AfterAll;
|
import org.junit.jupiter.api.AfterAll;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
@ -20,11 +27,15 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob;
|
||||||
import eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson;
|
import eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson;
|
||||||
|
import eu.dnetlib.dhp.collection.orcid.model.Author;
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class CreatePersonAS {
|
public class CreatePersonAS {
|
||||||
|
@ -156,7 +167,7 @@ public class CreatePersonAS {
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(
|
.anyMatch(
|
||||||
p -> p.getQualifier().getSchemename().equalsIgnoreCase("Scopus Author ID")
|
p -> p.getSchema().equalsIgnoreCase("Scopus Author ID")
|
||||||
&& p.getValue().equalsIgnoreCase("15119405200")));
|
&& p.getValue().equalsIgnoreCase("15119405200")));
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
|
|
|
@ -28,7 +28,6 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author miriam.baglioni
|
* @author miriam.baglioni
|
||||||
|
@ -271,17 +270,17 @@ public class CreateTAActionSetTest {
|
||||||
@Test
|
@Test
|
||||||
void testRelationsSourceTargetCouple() throws Exception {
|
void testRelationsSourceTargetCouple() throws Exception {
|
||||||
final String doi1 = "50|doi_________::"
|
final String doi1 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||||
final String doi2 = "50|doi_________::"
|
final String doi2 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||||
final String doi3 = "50|doi_________::"
|
final String doi3 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||||
final String doi4 = "50|doi_________::"
|
final String doi4 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||||
final String doi5 = "50|doi_________::"
|
final String doi5 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||||
final String doi6 = "50|doi_________::"
|
final String doi6 = "50|doi_________::"
|
||||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||||
|
|
||||||
String inputPath = getClass()
|
String inputPath = getClass()
|
||||||
.getResource(
|
.getResource(
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
|
@ -102,10 +101,7 @@ public class CreateASTest {
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
.map(aa -> ((Relation) aa.getPayload()));
|
.map(aa -> ((Relation) aa.getPayload()));
|
||||||
|
|
||||||
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
|
Assertions.assertEquals(58, tmp.count());
|
||||||
tmp.foreach(r -> assertTrue(r.getSource().startsWith("20|ror") || r.getSource().startsWith("50|doi")));
|
|
||||||
tmp.foreach(r -> assertTrue(r.getTarget().startsWith("20|ror") || r.getTarget().startsWith("50|doi")));
|
|
||||||
Assertions.assertEquals(24, tmp.count());
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,7 +112,7 @@ public class CreateASTest {
|
||||||
|
|
||||||
String inputPath = getClass()
|
String inputPath = getClass()
|
||||||
.getResource(
|
.getResource(
|
||||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
|
"/eu/dnetlib/dhp/actionmanager/webcrawl/")
|
||||||
.getPath();
|
.getPath();
|
||||||
String blackListPath = getClass()
|
String blackListPath = getClass()
|
||||||
.getResource(
|
.getResource(
|
||||||
|
@ -198,7 +194,7 @@ public class CreateASTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
1, tmp
|
2, tmp
|
||||||
.filter(
|
.filter(
|
||||||
r -> r
|
r -> r
|
||||||
.getSource()
|
.getSource()
|
||||||
|
@ -211,7 +207,7 @@ public class CreateASTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
1, tmp
|
2, tmp
|
||||||
.filter(
|
.filter(
|
||||||
r -> r
|
r -> r
|
||||||
.getTarget()
|
.getTarget()
|
||||||
|
@ -232,13 +228,13 @@ public class CreateASTest {
|
||||||
"20|ror_________::" + IdentifierFactory
|
"20|ror_________::" + IdentifierFactory
|
||||||
.md5(
|
.md5(
|
||||||
PidCleaner
|
PidCleaner
|
||||||
.normalizePidValue("ROR", "https://ror.org/03265fv13")))
|
.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
|
||||||
&& r.getSource().startsWith("50|doi"))
|
&& r.getSource().startsWith("50|doi"))
|
||||||
.count());
|
.count());
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
0, tmp
|
1, tmp
|
||||||
.filter(
|
.filter(
|
||||||
r -> r
|
r -> r
|
||||||
.getTarget()
|
.getTarget()
|
||||||
|
@ -272,10 +268,6 @@ public class CreateASTest {
|
||||||
.getResource(
|
.getResource(
|
||||||
"/eu/dnetlib/dhp/actionmanager/webcrawl")
|
"/eu/dnetlib/dhp/actionmanager/webcrawl")
|
||||||
.getPath();
|
.getPath();
|
||||||
String blackListPath = getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
CreateActionSetFromWebEntries
|
CreateActionSetFromWebEntries
|
||||||
.main(
|
.main(
|
||||||
|
@ -285,8 +277,7 @@ public class CreateASTest {
|
||||||
"-sourcePath",
|
"-sourcePath",
|
||||||
inputPath,
|
inputPath,
|
||||||
"-outputPath",
|
"-outputPath",
|
||||||
workingDir.toString() + "/actionSet1",
|
workingDir.toString() + "/actionSet1"
|
||||||
"-blackListPath", blackListPath
|
|
||||||
});
|
});
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
|
@ -1,108 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.AfterAll;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author miriam.baglioni
|
|
||||||
* @Date 22/04/24
|
|
||||||
*/
|
|
||||||
public class RemoveFromASTest {
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
||||||
|
|
||||||
private static SparkSession spark;
|
|
||||||
|
|
||||||
private static Path workingDir;
|
|
||||||
private static final Logger log = LoggerFactory
|
|
||||||
.getLogger(RemoveFromASTest.class);
|
|
||||||
|
|
||||||
@BeforeAll
|
|
||||||
public static void beforeAll() throws IOException {
|
|
||||||
workingDir = Files
|
|
||||||
.createTempDirectory(RemoveFromASTest.class.getSimpleName());
|
|
||||||
log.info("using work dir {}", workingDir);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.setAppName(RemoveFromASTest.class.getSimpleName());
|
|
||||||
|
|
||||||
conf.setMaster("local[*]");
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
|
||||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
|
||||||
|
|
||||||
spark = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(RemoveFromASTest.class.getSimpleName())
|
|
||||||
.config(conf)
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterAll
|
|
||||||
public static void afterAll() throws IOException {
|
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
|
||||||
spark.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testNumberofRelations() throws Exception {
|
|
||||||
|
|
||||||
String inputPath = getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/actionSet/")
|
|
||||||
.getPath();
|
|
||||||
String blackListPath = getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
RemoveRelationFromActionSet
|
|
||||||
.main(
|
|
||||||
new String[] {
|
|
||||||
"-isSparkSessionManaged",
|
|
||||||
Boolean.FALSE.toString(),
|
|
||||||
"-sourcePath",
|
|
||||||
inputPath,
|
|
||||||
"-outputPath",
|
|
||||||
workingDir.toString() + "/actionSet1",
|
|
||||||
"-blackListPath", blackListPath
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
JavaRDD<Relation> tmp = sc
|
|
||||||
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
|
||||||
.map(aa -> ((Relation) aa.getPayload()));
|
|
||||||
|
|
||||||
Assertions.assertEquals(22, tmp.count());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,103 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.gtr2;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|
||||||
|
|
||||||
class Gtr2PublicationsIteratorTest {
|
|
||||||
|
|
||||||
private static final String baseURL = "https://gtr.ukri.org/gtr/api";
|
|
||||||
|
|
||||||
private static final HttpClientParams clientParams = new HttpClientParams();
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
public void testOne() throws Exception {
|
|
||||||
System.out.println("one publication");
|
|
||||||
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, null, null, clientParams);
|
|
||||||
|
|
||||||
if (iterator.hasNext()) {
|
|
||||||
final String res = iterator.next();
|
|
||||||
assertNotNull(res);
|
|
||||||
System.out.println(res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
public void testPaging() throws Exception {
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "2", "2", clientParams);
|
|
||||||
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
Thread.sleep(300);
|
|
||||||
final String res = iterator.next();
|
|
||||||
assertNotNull(res);
|
|
||||||
System.out.println(res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
public void testOnePage() throws Exception {
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "12", "12", clientParams);
|
|
||||||
final int count = iterateAndCount(iterator);
|
|
||||||
assertEquals(20, count);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
public void testIncrementalHarvestingNoRecords() throws Exception {
|
|
||||||
System.out.println("incremental Harvesting");
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, "2050-12-12T", "11", "13",
|
|
||||||
clientParams);
|
|
||||||
final int count = iterateAndCount(iterator);
|
|
||||||
assertEquals(0, count);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
public void testIncrementalHarvesting() throws Exception {
|
|
||||||
System.out.println("incremental Harvesting");
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, "2016-11-30", "11", "11", clientParams);
|
|
||||||
final int count = iterateAndCount(iterator);
|
|
||||||
assertEquals(20, count);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
public void testCompleteHarvesting() throws Exception {
|
|
||||||
System.out.println("testing complete harvesting");
|
|
||||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, null, null, clientParams);
|
|
||||||
// TryIndentXmlString indenter = new TryIndentXmlString();
|
|
||||||
// it.setEndAtPage(3);
|
|
||||||
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
final String res = iterator.next();
|
|
||||||
assertNotNull(res);
|
|
||||||
// System.out.println(res);
|
|
||||||
// Scanner keyboard = new Scanner(System.in);
|
|
||||||
// System.out.println("press enter for next record");
|
|
||||||
// keyboard.nextLine();
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private int iterateAndCount(final Iterator<String> iterator) throws Exception {
|
|
||||||
int i = 0;
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
assertNotNull(iterator.next());
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
System.out.println("Got " + i + " publications");
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,122 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.osf;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
import static org.junit.jupiter.api.Assertions.fail;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.dom4j.DocumentHelper;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
|
||||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|
||||||
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
|
||||||
|
|
||||||
public class OsfPreprintsCollectorPluginTest {
|
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(OsfPreprintsCollectorPlugin.class);
|
|
||||||
|
|
||||||
private final String baseUrl = "https://api.osf.io/v2/preprints/";
|
|
||||||
|
|
||||||
private final int pageSize = 100;
|
|
||||||
|
|
||||||
private final ApiDescriptor api = new ApiDescriptor();
|
|
||||||
|
|
||||||
private OsfPreprintsCollectorPlugin plugin;
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
public void setUp() {
|
|
||||||
final HashMap<String, String> params = new HashMap<>();
|
|
||||||
params.put("pageSize", "" + this.pageSize);
|
|
||||||
|
|
||||||
this.api.setBaseUrl(this.baseUrl);
|
|
||||||
this.api.setParams(params);
|
|
||||||
|
|
||||||
this.plugin = new OsfPreprintsCollectorPlugin(new HttpClientParams());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
void test_one() throws CollectorException {
|
|
||||||
this.plugin
|
|
||||||
.collect(this.api, new AggregatorReport())
|
|
||||||
.limit(1)
|
|
||||||
.forEach(log::info);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
void test_limited() throws CollectorException {
|
|
||||||
final AtomicInteger i = new AtomicInteger(0);
|
|
||||||
final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());
|
|
||||||
|
|
||||||
stream.limit(2000).forEach(s -> {
|
|
||||||
Assertions.assertTrue(s.length() > 0);
|
|
||||||
i.incrementAndGet();
|
|
||||||
log.info(s);
|
|
||||||
});
|
|
||||||
|
|
||||||
log.info("{}", i.intValue());
|
|
||||||
Assertions.assertTrue(i.intValue() > 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
void test_all() throws CollectorException {
|
|
||||||
final AtomicLong i = new AtomicLong(0);
|
|
||||||
final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());
|
|
||||||
|
|
||||||
stream.forEach(s -> {
|
|
||||||
Assertions.assertTrue(s.length() > 0);
|
|
||||||
if ((i.incrementAndGet() % 1000) == 0) {
|
|
||||||
log.info("COLLECTED: {}", i.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
});
|
|
||||||
|
|
||||||
log.info("TOTAL: {}", i.get());
|
|
||||||
Assertions.assertTrue(i.get() > 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Disabled
|
|
||||||
void test_authentication_required() {
|
|
||||||
final HttpConnector2 connector = new HttpConnector2();
|
|
||||||
|
|
||||||
try {
|
|
||||||
final String res = connector
|
|
||||||
.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
|
||||||
System.out.println(res);
|
|
||||||
fail();
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
|
|
||||||
System.out.println("**** ERROR: " + e.getMessage());
|
|
||||||
|
|
||||||
if ((e instanceof CollectorException) && e.getMessage().contains("401")) {
|
|
||||||
System.out.println(" XML: " + DocumentHelper.createDocument().getRootElement().detach());
|
|
||||||
}
|
|
||||||
|
|
||||||
assertTrue(e.getMessage().contains("401"));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testXML() {
|
|
||||||
final String xml = JsonUtils.convertToXML("{'next':null}");
|
|
||||||
System.out.println(xml);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||||
|
|
||||||
|
public class OsfPreprintCollectorTest {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(OsfPreprintCollectorTest.class);
|
||||||
|
|
||||||
|
private final String baseUrl = "https://api.osf.io/v2/preprints/";
|
||||||
|
|
||||||
|
// private final String requestHeaderMap = "";
|
||||||
|
// private final String authMethod = "";
|
||||||
|
// private final String authToken = "";
|
||||||
|
// private final String resultOutputFormat = "";
|
||||||
|
|
||||||
|
private final String queryParams = "filter:is_published:d=true";
|
||||||
|
|
||||||
|
private final String entityXpath = "/*/*[local-name()='data']";
|
||||||
|
|
||||||
|
private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";
|
||||||
|
|
||||||
|
private final String resumptionParam = "page";
|
||||||
|
private final String resumptionType = "scan";
|
||||||
|
private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";
|
||||||
|
|
||||||
|
private final String resultSizeParam = "page[size]";
|
||||||
|
private final String resultSizeValue = "100";
|
||||||
|
|
||||||
|
private final String resultFormatParam = "format";
|
||||||
|
private final String resultFormatValue = "json";
|
||||||
|
|
||||||
|
private final ApiDescriptor api = new ApiDescriptor();
|
||||||
|
private RestCollectorPlugin rcp;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() {
|
||||||
|
final HashMap<String, String> params = new HashMap<>();
|
||||||
|
params.put("resumptionType", this.resumptionType);
|
||||||
|
params.put("resumptionParam", this.resumptionParam);
|
||||||
|
params.put("resumptionXpath", this.resumptionXpath);
|
||||||
|
params.put("resultTotalXpath", this.resultTotalXpath);
|
||||||
|
params.put("resultFormatParam", this.resultFormatParam);
|
||||||
|
params.put("resultFormatValue", this.resultFormatValue);
|
||||||
|
params.put("resultSizeParam", this.resultSizeParam);
|
||||||
|
params.put("resultSizeValue", this.resultSizeValue);
|
||||||
|
params.put("queryParams", this.queryParams);
|
||||||
|
params.put("entityXpath", this.entityXpath);
|
||||||
|
|
||||||
|
this.api.setBaseUrl(this.baseUrl);
|
||||||
|
this.api.setParams(params);
|
||||||
|
|
||||||
|
this.rcp = new RestCollectorPlugin(new HttpClientParams());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
void test_limited() throws CollectorException {
|
||||||
|
final AtomicInteger i = new AtomicInteger(0);
|
||||||
|
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||||
|
|
||||||
|
stream.limit(2000).forEach(s -> {
|
||||||
|
Assertions.assertTrue(s.length() > 0);
|
||||||
|
i.incrementAndGet();
|
||||||
|
log.info(s);
|
||||||
|
});
|
||||||
|
|
||||||
|
log.info("{}", i.intValue());
|
||||||
|
Assertions.assertTrue(i.intValue() > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
void test_all() throws CollectorException {
|
||||||
|
final AtomicLong i = new AtomicLong(0);
|
||||||
|
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||||
|
|
||||||
|
stream.forEach(s -> {
|
||||||
|
Assertions.assertTrue(s.length() > 0);
|
||||||
|
if ((i.incrementAndGet() % 1000) == 0) {
|
||||||
|
log.info("COLLECTED: {}", i.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
log.info("TOTAL: {}", i.get());
|
||||||
|
Assertions.assertTrue(i.get() > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,10 +1,9 @@
|
||||||
{"DOI":"10.1021\/ac020069k","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/01f5ytq51","Status":"active","Confidence":1}]}
|
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
||||||
{"DOI":"10.1161\/01.cir.0000013846.72805.7e","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02pttbw34","Status":"active","Confidence":1}]}
|
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
||||||
{"DOI":"10.1161\/hy02t2.102992","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/00qqv6244","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/00p991c53","Status":"active","Confidence":1}]}
|
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
||||||
{"DOI":"10.1126\/science.1073633","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03xez1567","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/006w34k90","Status":"active","Confidence":1}]}
|
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||||
{"DOI":"10.1089\/10872910260066679","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/05cf8a891","Status":"active","Confidence":1}]}
|
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||||
{"DOI":"10.1108\/02656719610116117","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03mnm0t94","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/007tn5k56","Status":"active","Confidence":1}]}
|
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||||
{"DOI":"10.1080\/01443610050111986","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/001x4vz59","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/01tmqtf75","Status":"active","Confidence":1}]}
|
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||||
{"DOI":"10.1021\/cm020118+","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02cf1je33","Confidence":1,"Status":"inactive"},{"PID":"ROR","Value":"https:\/\/ror.org\/01hvx5h04","Confidence":1,"Status":"active"}]}
|
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||||
{"DOI":"10.1161\/hc1202.104524","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/040r8fr65","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/04fctr677","Status":"active","Confidence":1}]}
|
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
||||||
{"DOI":"10.1021\/ma011134f","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/04tj63d06","Status":"active","Confidence":1}]}
|
|
|
@ -1,9 +0,0 @@
|
||||||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
|
||||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
|
||||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
|
||||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
|
||||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
|
||||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
|
||||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
|
||||||
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
|
||||||
{"DOI": "https://doi.org/10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, 
"Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
|
|
||||||
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, 
"Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03bea9k73", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
|
|
||||||
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04agmb972", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}
|
|
||||||
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"doi":"https://doi.org/10.1098/rstl.1684.0023","OpenAlexId":"https://openalex.org/W2124362779"}
|
|
|
@ -1,44 +1,15 @@
|
||||||
{"classification": "Signaling protein", "pdb": "5NM4", "deposition_date": "2017-04-05", "title": "A2a adenosine receptor room-temperature structure determined by serial Femtosecond crystallography", "Keywords": ["Oom-temperature", " serial crystallography", " signaling protein"], "authors": ["T.weinert", "R.cheng", "D.james", "D.gashi", "P.nogly", "K.jaeger", "M.hennig", "", "J.standfuss"], "pmid": "28912485", "doi": "10.1038/S41467-017-00630-4"}
|
{"pdb": "1CW0", "title": "crystal structure analysis of very short patch repair (vsr) endonuclease in complex with a duplex dna", "authors": ["S.E.Tsutakawa", "H.Jingami", "K.Morikawa"], "doi": "10.1016/S0092-8674(00)81550-0", "pmid": "10612397"}
|
||||||
{"classification": "Oxidoreductase/oxidoreductase inhibitor", "pdb": "4KN3", "deposition_date": "2013-05-08", "title": "Structure of the y34ns91g double mutant of dehaloperoxidase from Amphitrite ornata with 2,4,6-trichlorophenol", "Keywords": ["Lobin", " oxygen storage", " peroxidase", " oxidoreductase", " oxidoreductase-", "Oxidoreductase inhibitor complex"], "authors": ["C.wang", "L.lovelace", "L.lebioda"], "pmid": "23952341", "doi": "10.1021/BI400627W"}
|
{"pdb": "2CWW", "title": "crystal structure of thermus thermophilus ttha1280, a putative sam- dependent rna methyltransferase, in complex with s-adenosyl-l- homocysteine", "authors": ["A.A.Pioszak", "K.Murayama", "N.Nakagawa", "A.Ebihara", "S.Kuramitsu", "M.Shirouzu", "S.Yokoyama", "Riken Structural Genomics/proteomics Initiative (Rsgi)"], "doi": "10.1107/S1744309105029842", "pmid": "16511182"}
|
||||||
{"classification": "Transport protein", "pdb": "8HKM", "deposition_date": "2022-11-27", "title": "Ion channel", "Keywords": ["On channel", " transport protein"], "authors": ["D.h.jiang", "J.t.zhang"], "pmid": "37494189", "doi": "10.1016/J.CELREP.2023.112858"}
|
{"pdb": "6CWE", "title": "structure of alpha-gsa[8,6p] bound by cd1d and in complex with the va14vb8.2 tcr", "authors": ["J.Wang", "D.Zajonc"], "doi": null, "pmid": null}
|
||||||
{"classification": "Signaling protein", "pdb": "6JT1", "deposition_date": "2019-04-08", "title": "Structure of human soluble guanylate cyclase in the heme oxidised State", "Keywords": ["Oluble guanylate cyclase", " signaling protein"], "authors": ["L.chen", "Y.kang", "R.liu", "J.-x.wu"], "pmid": "31514202", "doi": "10.1038/S41586-019-1584-6"}
|
{"pdb": "5CWS", "title": "crystal structure of the intact chaetomium thermophilum nsp1-nup49- nup57 channel nucleoporin heterotrimer bound to its nic96 nuclear pore complex attachment site", "authors": ["C.J.Bley", "S.Petrovic", "M.Paduch", "V.Lu", "A.A.Kossiakoff", "A.Hoelz"], "doi": "10.1126/SCIENCE.AAC9176", "pmid": "26316600"}
|
||||||
{"classification": "Immune system", "pdb": "7OW6", "deposition_date": "2021-06-16", "title": "Crystal structure of a tcr in complex with hla-a*11:01 bound to kras G12d peptide (vvvgadgvgk)", "Keywords": ["La", " kras", " tcr", " immune system"], "authors": ["V.karuppiah", "R.a.robinson"], "doi": "10.1038/S41467-022-32811-1"}
|
{"pdb": "5CWE", "title": "structure of cyp107l2 from streptomyces avermitilis with lauric acid", "authors": ["T.-V.Pham", "S.-H.Han", "J.-H.Kim", "D.-H.Kim", "L.-W.Kang"], "doi": null, "pmid": null}
|
||||||
{"classification": "Biosynthetic protein", "pdb": "5EQ8", "deposition_date": "2015-11-12", "title": "Crystal structure of medicago truncatula histidinol-phosphate Phosphatase (mthpp) in complex with l-histidinol", "Keywords": ["Istidine biosynthesis", " metabolic pathways", " dimer", " plant", "", "Biosynthetic protein"], "authors": ["M.ruszkowski", "Z.dauter"], "pmid": "26994138", "doi": "10.1074/JBC.M115.708727"}
|
{"pdb": "7CW4", "title": "acetyl-coa acetyltransferase from bacillus cereus atcc 14579", "authors": ["J.Hong", "K.J.Kim"], "doi": "10.1016/J.BBRC.2020.09.048", "pmid": "32972748"}
|
||||||
{"classification": "De novo protein", "pdb": "8CWA", "deposition_date": "2022-05-18", "title": "Solution nmr structure of 8-residue rosetta-designed cyclic peptide D8.21 in cdcl3 with cis/trans switching (tc conformation, 53%)", "Keywords": ["Yclic peptide", " non natural amino acids", " cis/trans", " switch peptides", "", "De novo design", "Membrane permeability", "De novo protein"], "authors": ["T.a.ramelot", "R.tejero", "G.t.montelione"], "pmid": "36041435", "doi": "10.1016/J.CELL.2022.07.019"}
|
{"pdb": "2CWP", "title": "crystal structure of metrs related protein from pyrococcus horikoshii", "authors": ["K.Murayama", "M.Kato-Murayama", "M.Shirouzu", "S.Yokoyama", "Riken StructuralGenomics/proteomics Initiative (Rsgi)"], "doi": null, "pmid": null}
|
||||||
{"classification": "Hydrolase", "pdb": "3R6M", "deposition_date": "2011-03-21", "title": "Crystal structure of vibrio parahaemolyticus yeaz", "Keywords": ["Ctin/hsp70 nucleotide-binding fold", " bacterial resuscitation", " viable", "But non-culturable state", "Resuscitation promoting factor", "Ygjd", "", "Yjee", "Vibrio parahaemolyticus", "Hydrolase"], "authors": ["A.roujeinikova", "I.aydin"], "pmid": "21858042", "doi": "10.1371/JOURNAL.PONE.0023245"}
|
{"pdb": "2CW7", "title": "crystal structure of intein homing endonuclease ii", "authors": ["H.Matsumura", "H.Takahashi", "T.Inoue", "H.Hashimoto", "M.Nishioka", "S.Fujiwara", "M.Takagi", "T.Imanaka", "Y.Kai"], "doi": "10.1002/PROT.20858", "pmid": "16493661"}
|
||||||
{"classification": "Hydrolase", "pdb": "2W5J", "deposition_date": "2008-12-10", "title": "Structure of the c14-rotor ring of the proton translocating Chloroplast atp synthase", "Keywords": ["Ydrolase", " chloroplast", " atp synthase", " lipid-binding", " cf(0)", " membrane", "", "Transport", "Formylation", "Energy transduction", "Hydrogen ion transport", "", "Ion transport", "Transmembrane", "Membrane protein"], "authors": ["M.vollmar", "D.schlieper", "M.winn", "C.buechner", "G.groth"], "pmid": "19423706", "doi": "10.1074/JBC.M109.006916"}
|
{"pdb": "1CWU", "title": "brassica napus enoyl acp reductase a138g mutant complexed with nad+ and thienodiazaborine", "authors": ["A.Roujeinikova", "J.B.Rafferty", "D.W.Rice"], "doi": "10.1074/JBC.274.43.30811", "pmid": "10521472"}
|
||||||
{"classification": "De novo protein", "pdb": "4GLU", "deposition_date": "2012-08-14", "title": "Crystal structure of the mirror image form of vegf-a", "Keywords": ["-protein", " covalent dimer", " cysteine knot protein", " growth factor", " de", "Novo protein"], "authors": ["K.mandal", "M.uppalapati", "D.ault-riche", "J.kenney", "J.lowitz", "S.sidhu", "", "S.b.h.kent"], "pmid": "22927390", "doi": "10.1073/PNAS.1210483109"}
|
{"pdb": "3CWN", "title": "escherichia coli transaldolase b mutant f178y", "authors": ["T.Sandalova", "G.Schneider", "A.Samland"], "doi": "10.1074/JBC.M803184200", "pmid": "18687684"}
|
||||||
{"classification": "Hydrolase/hydrolase inhibitor", "pdb": "3WYL", "deposition_date": "2014-09-01", "title": "Crystal structure of the catalytic domain of pde10a complexed with 5- Methoxy-3-(1-phenyl-1h-pyrazol-5-yl)-1-(3-(trifluoromethyl)phenyl) Pyridazin-4(1h)-one", "Keywords": ["Ydrolase-hydrolase inhibitor complex"], "authors": ["H.oki", "Y.hayano"], "pmid": "25384088", "doi": "10.1021/JM5013648"}
|
{"pdb": "1CWL", "title": "human cyclophilin a complexed with 4 4-hydroxy-meleu cyclosporin", "authors": ["V.Mikol", "J.Kallen", "P.Taylor", "M.D.Walkinshaw"], "doi": "10.1006/JMBI.1998.2108", "pmid": "9769216"}
|
||||||
{"classification": "Isomerase", "pdb": "5BOR", "deposition_date": "2015-05-27", "title": "Structure of acetobacter aceti pure-s57c, sulfonate form", "Keywords": ["Cidophile", " pure", " purine biosynthesis", " isomerase"], "authors": ["K.l.sullivan", "T.j.kappock"]}
|
{"pdb": "3CW2", "title": "crystal structure of the intact archaeal translation initiation factor 2 from sulfolobus solfataricus .", "authors": ["E.A.Stolboushkina", "S.V.Nikonov", "A.D.Nikulin", "U.Blaesi", "D.J.Manstein", "R.V.Fedorov", "M.B.Garber", "O.S.Nikonov"], "doi": "10.1016/J.JMB.2008.07.039", "pmid": "18675278"}
|
||||||
{"classification": "Hydrolase", "pdb": "1X0C", "deposition_date": "2005-03-17", "title": "Improved crystal structure of isopullulanase from aspergillus niger Atcc 9642", "Keywords": ["Ullulan", " glycoside hydrolase family 49", " glycoprotein", " hydrolase"], "authors": ["M.mizuno", "T.tonozuka", "A.yamamura", "Y.miyasaka", "H.akeboshi", "S.kamitori", "", "A.nishikawa", "Y.sakano"], "pmid": "18155243", "doi": "10.1016/J.JMB.2007.11.098"}
|
{"pdb": "3CW9", "title": "4-chlorobenzoyl-coa ligase/synthetase in the thioester-forming conformation, bound to 4-chlorophenacyl-coa", "authors": ["A.S.Reger", "J.Cao", "R.Wu", "D.Dunaway-Mariano", "A.M.Gulick"], "doi": "10.1021/BI800696Y", "pmid": "18620418"}
|
||||||
{"classification": "Oxidoreductase", "pdb": "7CUP", "deposition_date": "2020-08-23", "title": "Structure of 2,5-dihydroxypridine dioxygenase from pseudomonas putida Kt2440", "Keywords": ["On-heme dioxygenase", " oxidoreductase"], "authors": ["G.q.liu", "H.z.tang"]}
|
{"pdb": "3CWU", "title": "crystal structure of an alka host/guest complex 2'-fluoro-2'-deoxy-1, n6-ethenoadenine:thymine base pair", "authors": ["B.R.Bowman", "S.Lee", "S.Wang", "G.L.Verdine"], "doi": "10.1016/J.STR.2008.04.012", "pmid": "18682218"}
|
||||||
{"classification": "Ligase", "pdb": "1VCN", "deposition_date": "2004-03-10", "title": "Crystal structure of t.th. hb8 ctp synthetase complex with sulfate Anion", "Keywords": ["Etramer", " riken structural genomics/proteomics initiative", " rsgi", "", "Structural genomics", "Ligase"], "authors": ["M.goto", "Riken structural genomics/proteomics initiative (rsgi)"], "pmid": "15296735", "doi": "10.1016/J.STR.2004.05.013"}
|
{"pdb": "5CWF", "title": "crystal structure of de novo designed helical repeat protein dhr8", "authors": ["G.Bhabha", "D.C.Ekiert"], "doi": "10.1038/NATURE16162", "pmid": "26675729"}
|
||||||
{"classification": "Transferase/transferase inhibitor", "pdb": "6C9V", "deposition_date": "2018-01-28", "title": "Mycobacterium tuberculosis adenosine kinase bound to (2r,3s,4r,5r)-2- (hydroxymethyl)-5-(6-(4-phenylpiperazin-1-yl)-9h-purin-9-yl) Tetrahydrofuran-3,4-diol", "Keywords": ["Ucleoside analog", " complex", " inhibitor", " structural genomics", " psi-2", "", "Protein structure initiative", "Tb structural genomics consortium", "", "Tbsgc", "Transferase-transferase inhibitor complex"], "authors": ["R.a.crespo", "Tb structural genomics consortium (tbsgc)"], "pmid": "31002508", "doi": "10.1021/ACS.JMEDCHEM.9B00020"}
|
|
||||||
{"classification": "De novo protein", "pdb": "4LPY", "deposition_date": "2013-07-16", "title": "Crystal structure of tencon variant g10", "Keywords": ["Ibronectin type iii fold", " alternate scaffold", " de novo protein"], "authors": ["A.teplyakov", "G.obmolova", "G.l.gilliland"], "pmid": "24375666", "doi": "10.1002/PROT.24502"}
|
|
||||||
{"classification": "Isomerase", "pdb": "2Y88", "deposition_date": "2011-02-03", "title": "Crystal structure of mycobacterium tuberculosis phosphoribosyl Isomerase (variant d11n) with bound prfar", "Keywords": ["Romatic amino acid biosynthesis", " isomerase", " tim-barrel", " histidine", "Biosynthesis", "Tryptophan biosynthesis"], "authors": ["J.kuper", "A.v.due", "A.geerlof", "M.wilmanns"], "pmid": "21321225", "doi": "10.1073/PNAS.1015996108"}
|
|
||||||
{"classification": "Unknown function", "pdb": "1SR0", "deposition_date": "2004-03-22", "title": "Crystal structure of signalling protein from sheep(sps-40) at 3.0a Resolution using crystal grown in the presence of polysaccharides", "Keywords": ["Ignalling protein", " involution", " unknown function"], "authors": ["D.b.srivastava", "A.s.ethayathulla", "N.singh", "J.kumar", "S.sharma", "T.p.singh"]}
|
|
||||||
{"classification": "Dna binding protein", "pdb": "3RH2", "deposition_date": "2011-04-11", "title": "Crystal structure of a tetr-like transcriptional regulator (sama_0099) From shewanella amazonensis sb2b at 2.42 a resolution", "Keywords": ["Na/rna-binding 3-helical bundle", " structural genomics", " joint center", "For structural genomics", "Jcsg", "Protein structure initiative", "Psi-", "Biology", "Dna binding protein"], "authors": ["Joint center for structural genomics (jcsg)"]}
|
|
||||||
{"classification": "Transferase", "pdb": "2WK5", "deposition_date": "2009-06-05", "title": "Structural features of native human thymidine phosphorylase And in complex with 5-iodouracil", "Keywords": ["Lycosyltransferase", " developmental protein", " angiogenesis", "", "5-iodouracil", "Growth factor", "Enzyme kinetics", "", "Differentiation", "Disease mutation", "Thymidine", "Phosphorylase", "Chemotaxis", "Transferase", "Mutagenesis", "", "Polymorphism"], "authors": ["E.mitsiki", "A.c.papageorgiou", "S.iyer", "N.thiyagarajan", "S.h.prior", "", "D.sleep", "C.finnis", "K.r.acharya"], "pmid": "19555658", "doi": "10.1016/J.BBRC.2009.06.104"}
|
|
||||||
{"classification": "Hydrolase", "pdb": "3P9Y", "deposition_date": "2010-10-18", "title": "Crystal structure of the drosophila melanogaster ssu72-pctd complex", "Keywords": ["Hosphatase", " cis proline", " lmw ptp-like fold", " rna polymerase ii ctd", "", "Hydrolase"], "authors": ["J.w.werner-allen", "P.zhou"], "pmid": "21159777", "doi": "10.1074/JBC.M110.197129"}
|
|
||||||
{"classification": "Recombination/dna", "pdb": "6OEO", "deposition_date": "2019-03-27", "title": "Cryo-em structure of mouse rag1/2 nfc complex (dna1)", "Keywords": ["(d)j recombination", " dna transposition", " rag", " scid", " recombination", "", "Recombination-dna complex"], "authors": ["X.chen", "Y.cui", "Z.h.zhou", "W.yang", "M.gellert"], "pmid": "32015552", "doi": "10.1038/S41594-019-0363-2"}
|
|
||||||
{"classification": "Hydrolase", "pdb": "4ECA", "deposition_date": "1997-02-21", "title": "Asparaginase from e. coli, mutant t89v with covalently bound aspartate", "Keywords": ["Ydrolase", " acyl-enzyme intermediate", " threonine amidohydrolase"], "authors": ["G.j.palm", "J.lubkowski", "A.wlodawer"], "pmid": "8706862", "doi": "10.1016/0014-5793(96)00660-6"}
|
|
||||||
{"classification": "Transcription/protein binding", "pdb": "3UVX", "deposition_date": "2011-11-30", "title": "Crystal structure of the first bromodomain of human brd4 in complex With a diacetylated histone 4 peptide (h4k12ack16ac)", "Keywords": ["Romodomain", " bromodomain containing protein 4", " cap", " hunk1", " mcap", "", "Mitotic chromosome associated protein", "Peptide complex", "Structural", "Genomics consortium", "Sgc", "Transcription-protein binding complex"], "authors": ["P.filippakopoulos", "S.picaud", "T.keates", "E.ugochukwu", "F.von delft", "", "C.h.arrowsmith", "A.m.edwards", "J.weigelt", "C.bountra", "S.knapp", "Structural", "Genomics consortium (sgc)"], "pmid": "22464331", "doi": "10.1016/J.CELL.2012.02.013"}
|
|
||||||
{"classification": "Membrane protein", "pdb": "1TLZ", "deposition_date": "2004-06-10", "title": "Tsx structure complexed with uridine", "Keywords": ["Ucleoside transporter", " beta barrel", " uridine", " membrane", "Protein"], "authors": ["J.ye", "B.van den berg"], "pmid": "15272310", "doi": "10.1038/SJ.EMBOJ.7600330"}
|
|
||||||
{"classification": "Dna binding protein", "pdb": "7AZD", "deposition_date": "2020-11-16", "title": "Dna polymerase sliding clamp from escherichia coli with peptide 20 Bound", "Keywords": ["Ntibacterial drug", " dna binding protein"], "authors": ["C.monsarrat", "G.compain", "C.andre", "I.martiel", "S.engilberge", "V.olieric", "", "P.wolff", "K.brillet", "M.landolfo", "C.silva da veiga", "J.wagner", "G.guichard", "", "D.y.burnouf"], "pmid": "34806883", "doi": "10.1021/ACS.JMEDCHEM.1C00918"}
|
|
||||||
{"classification": "Transferase", "pdb": "5N3K", "deposition_date": "2017-02-08", "title": "Camp-dependent protein kinase a from cricetulus griseus in complex With fragment like molecule o-guanidino-l-homoserine", "Keywords": ["Ragment", " complex", " transferase", " serine threonine kinase", " camp", "", "Kinase", "Pka"], "authors": ["C.siefker", "A.heine", "G.klebe"]}
|
|
||||||
{"classification": "Biosynthetic protein", "pdb": "8H52", "deposition_date": "2022-10-11", "title": "Crystal structure of helicobacter pylori carboxyspermidine Dehydrogenase in complex with nadp", "Keywords": ["Arboxyspermidine dehydrogenase", " biosynthetic protein"], "authors": ["K.y.ko", "S.c.park", "S.y.cho", "S.i.yoon"], "pmid": "36283333", "doi": "10.1016/J.BBRC.2022.10.049"}
|
|
||||||
{"classification": "Metal binding protein", "pdb": "6DYC", "deposition_date": "2018-07-01", "title": "Co(ii)-bound structure of the engineered cyt cb562 variant, ch3", "Keywords": ["Esigned protein", " 4-helix bundle", " electron transport", " metal binding", "Protein"], "authors": ["F.a.tezcan", "J.rittle"], "pmid": "30778140", "doi": "10.1038/S41557-019-0218-9"}
|
|
||||||
{"classification": "Protein fibril", "pdb": "6A6B", "deposition_date": "2018-06-27", "title": "Cryo-em structure of alpha-synuclein fiber", "Keywords": ["Lpha-syn fiber", " parkinson disease", " protein fibril"], "authors": ["Y.w.li", "C.y.zhao", "F.luo", "Z.liu", "X.gui", "Z.luo", "X.zhang", "D.li", "C.liu", "X.li"], "pmid": "30065316", "doi": "10.1038/S41422-018-0075-X"}
|
|
||||||
{"classification": "Dna", "pdb": "7D5E", "deposition_date": "2020-09-25", "title": "Left-handed g-quadruplex containing two bulges", "Keywords": ["-quadruplex", " bulge", " dna", " left-handed"], "authors": ["P.das", "A.maity", "K.h.ngo", "F.r.winnerdy", "B.bakalar", "Y.mechulam", "E.schmitt", "", "A.t.phan"], "pmid": "33503265", "doi": "10.1093/NAR/GKAA1259"}
|
|
||||||
{"classification": "Transferase", "pdb": "3RSY", "deposition_date": "2011-05-02", "title": "Cellobiose phosphorylase from cellulomonas uda in complex with sulfate And glycerol", "Keywords": ["H94", " alpha barrel", " cellobiose phosphorylase", " disaccharide", "Phosphorylase", "Transferase"], "authors": ["A.van hoorebeke", "J.stout", "W.soetaert", "J.van beeumen", "T.desmet", "S.savvides"]}
|
|
||||||
{"classification": "Oxidoreductase", "pdb": "7MCI", "deposition_date": "2021-04-02", "title": "Mofe protein from azotobacter vinelandii with a sulfur-replenished Cofactor", "Keywords": ["Zotobacter vinelandii", " mofe-protein", " nitrogenase", " oxidoreductase"], "authors": ["W.kang", "C.lee", "Y.hu", "M.w.ribbe"], "doi": "10.1038/S41929-022-00782-7"}
|
|
||||||
{"classification": "Dna", "pdb": "1XUW", "deposition_date": "2004-10-26", "title": "Structural rationalization of a large difference in rna affinity Despite a small difference in chemistry between two 2'-o-modified Nucleic acid analogs", "Keywords": ["Na mimetic methylcarbamate amide analog", " dna"], "authors": ["R.pattanayek", "L.sethaphong", "C.pan", "M.prhavc", "T.p.prakash", "M.manoharan", "", "M.egli"], "pmid": "15547979", "doi": "10.1021/JA044637K"}
|
|
||||||
{"classification": "Lyase", "pdb": "7C0D", "deposition_date": "2020-05-01", "title": "Crystal structure of azospirillum brasilense l-2-keto-3-deoxyarabonate Dehydratase (hydroxypyruvate-bound form)", "Keywords": ["-2-keto-3-deoxyarabonate dehydratase", " lyase"], "authors": ["Y.watanabe", "S.watanabe"], "pmid": "32697085", "doi": "10.1021/ACS.BIOCHEM.0C00515"}
|
|
||||||
{"classification": "Signaling protein", "pdb": "5LYK", "deposition_date": "2016-09-28", "title": "Crystal structure of intracellular b30.2 domain of btn3a1 bound to Citrate", "Keywords": ["30.2", " butyrophilin", " signaling protein"], "authors": ["F.mohammed", "A.t.baker", "M.salim", "B.e.willcox"], "pmid": "28862425", "doi": "10.1021/ACSCHEMBIO.7B00694"}
|
|
||||||
{"classification": "Toxin", "pdb": "4IZL", "deposition_date": "2013-01-30", "title": "Structure of the n248a mutant of the panton-valentine leucocidin s Component from staphylococcus aureus", "Keywords": ["I-component leucotoxin", " staphylococcus aureus", " s component", "Leucocidin", "Beta-barrel pore forming toxin", "Toxin"], "authors": ["L.maveyraud", "B.j.laventie", "G.prevost", "L.mourey"], "pmid": "24643034", "doi": "10.1371/JOURNAL.PONE.0092094"}
|
|
||||||
{"classification": "Dna", "pdb": "6F3C", "deposition_date": "2017-11-28", "title": "The cytotoxic [pt(h2bapbpy)] platinum complex interacting with the Cgtacg hexamer", "Keywords": ["Rug-dna complex", " four-way junction", " dna"], "authors": ["M.ferraroni", "C.bazzicalupi", "P.gratteri", "F.papi"], "pmid": "31046177", "doi": "10.1002/ANIE.201814532"}
|
|
||||||
{"classification": "Signaling protein/inhibitor", "pdb": "4L5M", "deposition_date": "2013-06-11", "title": "Complexe of arno sec7 domain with the protein-protein interaction Inhibitor n-(4-hydroxy-2,6-dimethylphenyl)benzenesulfonamide at ph6.5", "Keywords": ["Ec-7domain", " signaling protein-inhibitor complex"], "authors": ["F.hoh", "J.rouhana"], "pmid": "24112024", "doi": "10.1021/JM4009357"}
|
|
||||||
{"classification": "Signaling protein", "pdb": "5I6J", "deposition_date": "2016-02-16", "title": "Crystal structure of srgap2 f-barx", "Keywords": ["Rgap2", " f-bar", " fx", " signaling protein"], "authors": ["M.sporny", "J.guez-haddad", "M.n.isupov", "Y.opatowsky"], "pmid": "28333212", "doi": "10.1093/MOLBEV/MSX094"}
|
|
||||||
{"classification": "Metal binding protein", "pdb": "1Q80", "deposition_date": "2003-08-20", "title": "Solution structure and dynamics of nereis sarcoplasmic calcium binding Protein", "Keywords": ["Ll-alpha", " metal binding protein"], "authors": ["G.rabah", "R.popescu", "J.a.cox", "Y.engelborghs", "C.t.craescu"], "pmid": "15819893", "doi": "10.1111/J.1742-4658.2005.04629.X"}
|
|
||||||
{"classification": "Transferase", "pdb": "1TW1", "deposition_date": "2004-06-30", "title": "Beta-1,4-galactosyltransferase mutant met344his (m344h-gal-t1) complex With udp-galactose and magnesium", "Keywords": ["Et344his mutation; closed conformation; mn binding", " transferase"], "authors": ["B.ramakrishnan", "E.boeggeman", "P.k.qasba"], "pmid": "15449940", "doi": "10.1021/BI049007+"}
|
|
||||||
{"classification": "Rna", "pdb": "2PN4", "deposition_date": "2007-04-23", "title": "Crystal structure of hepatitis c virus ires subdomain iia", "Keywords": ["Cv", " ires", " subdoamin iia", " rna", " strontium", " hepatitis"], "authors": ["Q.zhao", "Q.han", "C.r.kissinger", "P.a.thompson"], "pmid": "18391410", "doi": "10.1107/S0907444908002011"}
|
|
|
@ -1,36 +1,6 @@
|
||||||
{"pid": " Q6GZX4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 43"}], "title": "Putative transcription factor 001R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
{"pid": "Q6GZX4", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 41."}], "title": "Putative transcription factor 001R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||||
{"pid": " Q6GZX3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 45"}], "title": "Uncharacterized protein 002L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
{"pid": "Q6GZX3", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 42."}], "title": "Uncharacterized protein 002L;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||||
{"pid": " Q197F8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 29"}], "title": "Uncharacterized protein 002R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
{"pid": "Q197F8", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 27."}], "title": "Uncharacterized protein 002R;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
|
||||||
{"pid": " Q197F7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 23"}], "title": "Uncharacterized protein 003L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
{"pid": "Q197F7", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 23."}], "title": "Uncharacterized protein 003L;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
|
||||||
{"pid": " Q6GZX2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 3R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
{"pid": "Q6GZX2", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 36."}], "title": "Uncharacterized protein 3R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||||
{"pid": " Q6GZX1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 38"}], "title": "Uncharacterized protein 004R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
{"pid": "Q6GZX1", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 34."}], "title": "Uncharacterized protein 004R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||||
{"pid": " Q197F5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 005L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q6GZX0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 47"}], "title": "Uncharacterized protein 005R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q91G88", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-06-28", "date_info": "entry version 53"}], "title": "Putative KilA-N domain-containing protein 006L", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
|
|
||||||
{"pid": " Q6GZW9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 34"}], "title": "Uncharacterized protein 006R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZW8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 32"}], "title": "Uncharacterized protein 007R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q197F3", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 007R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q197F2", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 22"}], "title": "Uncharacterized protein 008L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q6GZW6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 67"}], "title": "Putative helicase 009L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q91G85", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 38"}], "title": "Uncharacterized protein 009R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
|
|
||||||
{"pid": " Q6GZW5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 010R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q197E9", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 011L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q6GZW4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 011R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZW3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 012L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q197E7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 37"}], "title": "Uncharacterized protein IIV3-013L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q6GZW2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 30"}], "title": "Uncharacterized protein 013R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZW1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 014R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZW0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 50"}], "title": "Uncharacterized protein 015R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZV8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 017L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZV7", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 018L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZV6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 87"}], "title": "Putative serine/threonine-protein kinase 019R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZV5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 40"}], "title": "Uncharacterized protein 020R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZV4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 021L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q197D8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-12-14", "date_info": "entry version 35"}], "title": "Transmembrane protein 022L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q6GZV2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 023R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q197D7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 25"}], "title": "Uncharacterized protein 023R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q6GZV1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 024R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q197D5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 24"}], "title": "Uncharacterized protein 025R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
|
||||||
{"pid": " Q91G70", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 026R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
|
|
||||||
{"pid": " Q6GZU9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 49"}], "title": "Uncharacterized protein 027R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
||||||
{"pid": " Q6GZU8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 55"}], "title": "Uncharacterized protein 028R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
|
@ -26,7 +26,7 @@ class MAGMappingTest {
|
||||||
@Test
|
@Test
|
||||||
def mappingMagType(): Unit = {
|
def mappingMagType(): Unit = {
|
||||||
|
|
||||||
checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = true, "Other literature type")
|
checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = false, "Other literature type")
|
||||||
checkResult[Publication](
|
checkResult[Publication](
|
||||||
MagUtility.createResultFromType(Some("BookChapter"), null),
|
MagUtility.createResultFromType(Some("BookChapter"), null),
|
||||||
invisible = false,
|
invisible = false,
|
||||||
|
|
|
@ -70,8 +70,9 @@ public class PrepareRelatedProjectsJob {
|
||||||
|
|
||||||
final Dataset<Relation> rels = ClusterUtils
|
final Dataset<Relation> rels = ClusterUtils
|
||||||
.loadRelations(graphPath, spark)
|
.loadRelations(graphPath, spark)
|
||||||
.filter((FilterFunction<Relation>) r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType()))
|
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
|
||||||
.filter((FilterFunction<Relation>) r -> !BrokerConstants.IS_MERGED_IN_CLASS.equals(r.getRelClass()))
|
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
|
||||||
|
.filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||||
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
||||||
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,7 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
return source
|
return source
|
||||||
.getSubjects()
|
.getPids()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> !existingSubjects.contains(subjectAsString(s)))
|
.filter(s -> !existingSubjects.contains(subjectAsString(s)))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
|
@ -1,60 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
|
||||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
|
||||||
|
|
||||||
public class EnrichMoreSubjectTest {
|
|
||||||
|
|
||||||
final EnrichMoreSubject matcher = new EnrichMoreSubject();
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
void setUp() throws Exception {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testFindDifferences_1() {
|
|
||||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
|
||||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
|
||||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
|
||||||
assertTrue(list.isEmpty());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testFindDifferences_2() {
|
|
||||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
|
||||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
|
||||||
source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
|
||||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
|
||||||
assertEquals(1, list.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testFindDifferences_3() {
|
|
||||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
|
||||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
|
||||||
target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
|
||||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
|
||||||
assertTrue(list.isEmpty());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testFindDifferences_4() {
|
|
||||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
|
||||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
|
||||||
source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
|
||||||
target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
|
||||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
|
||||||
assertTrue(list.isEmpty());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -17,6 +17,45 @@ import eu.dnetlib.pace.tree.support.TreeStats;
|
||||||
|
|
||||||
class DecisionTreeTest {
|
class DecisionTreeTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testJPath() throws IOException {
|
||||||
|
|
||||||
|
DedupConfig conf = DedupConfig
|
||||||
|
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
|
||||||
|
|
||||||
|
final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
|
||||||
|
|
||||||
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
System.out.println("row = " + row);
|
||||||
|
Assertions.assertNotNull(row);
|
||||||
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
|
|
||||||
|
System.out.println("row = " + row.getAs("countrytitle"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void jsonToModelTest() throws IOException {
|
||||||
|
DedupConfig conf = DedupConfig
|
||||||
|
.load(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkOpenorgsDedupTest.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||||
|
|
||||||
|
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||||
|
|
||||||
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
// to check that the same parsing returns the same row
|
||||||
|
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
Assertions.assertEquals(row, row1);
|
||||||
|
System.out.println("row = " + row);
|
||||||
|
Assertions.assertNotNull(row);
|
||||||
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void organizationDecisionTreeTest() throws Exception {
|
void organizationDecisionTreeTest() throws Exception {
|
||||||
DedupConfig conf = DedupConfig
|
DedupConfig conf = DedupConfig
|
||||||
|
|
|
@ -452,18 +452,18 @@ public class SparkDedupTest implements Serializable {
|
||||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||||
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||||
assertFalse(dups.contains(r.getTarget()));
|
assertTrue(dups.contains(r.getTarget()));
|
||||||
});
|
});
|
||||||
|
|
||||||
final List<Relation> mergedIn = pubs
|
final List<Relation> mergedIn = pubs
|
||||||
.filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
.filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||||
.collectAsList();
|
.collectAsList();
|
||||||
assertEquals(1, mergedIn.size());
|
assertEquals(3, mergedIn.size());
|
||||||
mergedIn.forEach(r -> {
|
mergedIn.forEach(r -> {
|
||||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||||
assertFalse(dups.contains(r.getSource()));
|
assertTrue(dups.contains(r.getSource()));
|
||||||
});
|
});
|
||||||
|
|
||||||
System.out.println("orgs_mergerel = " + orgs_mergerel);
|
System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||||
|
@ -473,8 +473,8 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_mergerel = " + orp_mergerel);
|
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(1278, orgs_mergerel);
|
assertEquals(1268, orgs_mergerel);
|
||||||
assertEquals(1158, pubs.count());
|
assertEquals(1156, pubs.count());
|
||||||
assertEquals(292, sw_mergerel);
|
assertEquals(292, sw_mergerel);
|
||||||
assertEquals(476, ds_mergerel);
|
assertEquals(476, ds_mergerel);
|
||||||
assertEquals(742, orp_mergerel);
|
assertEquals(742, orp_mergerel);
|
||||||
|
|
|
@ -241,6 +241,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
|
|
||||||
verifyRoot_case_1(roots, pubs);
|
verifyRoot_case_1(roots, pubs);
|
||||||
verifyRoot_case_2(roots, pubs);
|
verifyRoot_case_2(roots, pubs);
|
||||||
|
verifyRoot_case_3(roots, pubs);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||||
|
@ -321,6 +322,34 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||||
|
Publication root = roots
|
||||||
|
.filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
|
||||||
|
.first();
|
||||||
|
assertNotNull(root);
|
||||||
|
|
||||||
|
Publication pivot_duplicate = pubs
|
||||||
|
.filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
|
||||||
|
.first();
|
||||||
|
|
||||||
|
assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||||
|
|
||||||
|
Set<String> dups_cf = pubs
|
||||||
|
.collectAsList()
|
||||||
|
.stream()
|
||||||
|
.flatMap(p -> p.getCollectedfrom().stream())
|
||||||
|
.map(KeyValue::getValue)
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
|
||||||
|
Set<String> root_cf = root
|
||||||
|
.getCollectedfrom()
|
||||||
|
.stream()
|
||||||
|
.map(KeyValue::getValue)
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
|
||||||
|
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(6)
|
@Order(6)
|
||||||
void updateEntityTest() throws Exception {
|
void updateEntityTest() throws Exception {
|
||||||
|
|
|
@ -143,9 +143,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
||||||
"--graphBasePath", graphInputPath,
|
"--graphBasePath", graphInputPath,
|
||||||
"--actionSetId", testActionSetId,
|
"--actionSetId", testActionSetId,
|
||||||
"--isLookUpUrl", "lookupurl",
|
"--isLookUpUrl", "lookupurl",
|
||||||
"--workingPath", workingPath,
|
"--workingPath", workingPath
|
||||||
"--hiveMetastoreUris", "none",
|
|
||||||
"--pivotHistoryDatabase", ""
|
|
||||||
}), spark)
|
}), spark)
|
||||||
.run(isLookUpService);
|
.run(isLookUpService);
|
||||||
|
|
||||||
|
@ -155,7 +153,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
||||||
.as(Encoders.bean(Relation.class));
|
.as(Encoders.bean(Relation.class));
|
||||||
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
4, merges
|
3, merges
|
||||||
.filter("relclass == 'isMergedIn'")
|
.filter("relclass == 'isMergedIn'")
|
||||||
.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
|
.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
|
||||||
.distinct()
|
.distinct()
|
||||||
|
@ -180,7 +178,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
||||||
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
||||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||||
|
|
||||||
assertEquals(4, roots.count());
|
assertEquals(3, roots.count());
|
||||||
|
|
||||||
final Dataset<Publication> pubs = spark
|
final Dataset<Publication> pubs = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -197,7 +195,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.get(0);
|
.get(0);
|
||||||
|
|
||||||
assertEquals("2022-01-01", root.getDateofacceptance().getValue());
|
assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
|
||||||
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
||||||
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
|
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
|
||||||
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||||
|
|
|
@ -168,7 +168,7 @@ public class SparkStatsTest implements Serializable {
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(412, orgs_blocks);
|
assertEquals(414, orgs_blocks);
|
||||||
assertEquals(221, pubs_blocks);
|
assertEquals(221, pubs_blocks);
|
||||||
assertEquals(134, sw_blocks);
|
assertEquals(134, sw_blocks);
|
||||||
assertEquals(196, ds_blocks);
|
assertEquals(196, ds_blocks);
|
||||||
|
|
|
@ -73,6 +73,12 @@
|
||||||
"name": "Irish Nephrology Society",
|
"name": "Irish Nephrology Society",
|
||||||
"synonym": []
|
"synonym": []
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": "100011062",
|
||||||
|
"uri": "http://dx.doi.org/10.13039/100011062",
|
||||||
|
"name": "Asian Spinal Cord Network",
|
||||||
|
"synonym": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "100011096",
|
"id": "100011096",
|
||||||
"uri": "http://dx.doi.org/10.13039/100011096",
|
"uri": "http://dx.doi.org/10.13039/100011096",
|
||||||
|
@ -217,6 +223,12 @@
|
||||||
"name": "Global Brain Health Institute",
|
"name": "Global Brain Health Institute",
|
||||||
"synonym": []
|
"synonym": []
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": "100015776",
|
||||||
|
"uri": "http://dx.doi.org/10.13039/100015776",
|
||||||
|
"name": "Health and Social Care Board",
|
||||||
|
"synonym": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "100015992",
|
"id": "100015992",
|
||||||
"uri": "http://dx.doi.org/10.13039/100015992",
|
"uri": "http://dx.doi.org/10.13039/100015992",
|
||||||
|
@ -391,6 +403,18 @@
|
||||||
"name": "Irish Hospice Foundation",
|
"name": "Irish Hospice Foundation",
|
||||||
"synonym": []
|
"synonym": []
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": "501100001596",
|
||||||
|
"uri": "http://dx.doi.org/10.13039/501100001596",
|
||||||
|
"name": "Irish Research Council for Science, Engineering and Technology",
|
||||||
|
"synonym": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "501100001597",
|
||||||
|
"uri": "http://dx.doi.org/10.13039/501100001597",
|
||||||
|
"name": "Irish Research Council for the Humanities and Social Sciences",
|
||||||
|
"synonym": []
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "501100001598",
|
"id": "501100001598",
|
||||||
"uri": "http://dx.doi.org/10.13039/501100001598",
|
"uri": "http://dx.doi.org/10.13039/501100001598",
|
||||||
|
@ -491,7 +515,7 @@
|
||||||
"id": "501100002081",
|
"id": "501100002081",
|
||||||
"uri": "http://dx.doi.org/10.13039/501100002081",
|
"uri": "http://dx.doi.org/10.13039/501100002081",
|
||||||
"name": "Irish Research Council",
|
"name": "Irish Research Council",
|
||||||
"synonym": ["501100001596", "501100001597"]
|
"synonym": []
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "501100002736",
|
"id": "501100002736",
|
||||||
|
|
|
@ -560,15 +560,7 @@ case object Crossref2Oaf {
|
||||||
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
|
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
|
||||||
"10.13039/501100013589" | "10.13039/501100000271" =>
|
"10.13039/501100013589" | "10.13039/501100000271" =>
|
||||||
generateSimpleRelationFromAward(funder, "ukri________", a => a)
|
generateSimpleRelationFromAward(funder, "ukri________", a => a)
|
||||||
//HFRI
|
|
||||||
case "10.13039/501100013209" =>
|
|
||||||
generateSimpleRelationFromAward(funder, "hfri________", a => a)
|
|
||||||
val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
|
|
||||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
|
||||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
|
||||||
//ERASMUS+
|
|
||||||
case "10.13039/501100010790" =>
|
|
||||||
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
|
|
||||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,13 +13,13 @@ public class CommunityContentprovider {
|
||||||
private String openaireId;
|
private String openaireId;
|
||||||
private SelectionConstraints selectioncriteria;
|
private SelectionConstraints selectioncriteria;
|
||||||
|
|
||||||
private Boolean enabled;
|
private String enabled;
|
||||||
|
|
||||||
public Boolean getEnabled() {
|
public String getEnabled() {
|
||||||
return enabled;
|
return enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setEnabled(Boolean enabled) {
|
public void setEnabled(String enabled) {
|
||||||
this.enabled = enabled;
|
this.enabled = enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,8 +53,6 @@ public class Constraints implements Serializable {
|
||||||
|
|
||||||
for (Constraint sc : constraint) {
|
for (Constraint sc : constraint) {
|
||||||
boolean verified = false;
|
boolean verified = false;
|
||||||
if (!param.containsKey(sc.getField()))
|
|
||||||
return false;
|
|
||||||
for (String value : param.get(sc.getField())) {
|
for (String value : param.get(sc.getField())) {
|
||||||
if (sc.verifyCriteria(value.trim())) {
|
if (sc.verifyCriteria(value.trim())) {
|
||||||
verified = true;
|
verified = true;
|
||||||
|
|
|
@ -130,7 +130,6 @@ public class ResultTagger implements Serializable {
|
||||||
// log.info("Remove constraints for " + communityId);
|
// log.info("Remove constraints for " + communityId);
|
||||||
if (conf.getRemoveConstraintsMap().keySet().contains(communityId) &&
|
if (conf.getRemoveConstraintsMap().keySet().contains(communityId) &&
|
||||||
conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||||
!conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
|
||||||
conf
|
conf
|
||||||
.getRemoveConstraintsMap()
|
.getRemoveConstraintsMap()
|
||||||
.get(communityId)
|
.get(communityId)
|
||||||
|
@ -162,30 +161,29 @@ public class ResultTagger implements Serializable {
|
||||||
|
|
||||||
// Tagging for datasource
|
// Tagging for datasource
|
||||||
final Set<String> datasources = new HashSet<>();
|
final Set<String> datasources = new HashSet<>();
|
||||||
final Set<String> cfhb = new HashSet<>();
|
final Set<String> collfrom = new HashSet<>();
|
||||||
final Set<String> hostdby = new HashSet<>();
|
final Set<String> hostdby = new HashSet<>();
|
||||||
|
|
||||||
if (Objects.nonNull(result.getInstance())) {
|
if (Objects.nonNull(result.getInstance())) {
|
||||||
for (Instance i : result.getInstance()) {
|
for (Instance i : result.getInstance()) {
|
||||||
if (Objects.nonNull(i.getCollectedfrom()) && Objects.nonNull(i.getCollectedfrom().getKey())) {
|
if (Objects.nonNull(i.getCollectedfrom()) && Objects.nonNull(i.getCollectedfrom().getKey())) {
|
||||||
cfhb.add(i.getCollectedfrom().getKey());
|
collfrom.add(i.getCollectedfrom().getKey());
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(i.getHostedby()) && Objects.nonNull(i.getHostedby().getKey())) {
|
if (Objects.nonNull(i.getHostedby()) && Objects.nonNull(i.getHostedby().getKey())) {
|
||||||
cfhb.add(i.getHostedby().getKey());
|
|
||||||
hostdby.add(i.getHostedby().getKey());
|
hostdby.add(i.getHostedby().getKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cfhb
|
collfrom
|
||||||
.forEach(
|
.forEach(
|
||||||
dsId -> datasources
|
dsId -> datasources
|
||||||
.addAll(
|
.addAll(
|
||||||
conf.getCommunityForDatasource(dsId, param)));
|
conf.getCommunityForDatasource(dsId, param)));
|
||||||
hostdby.forEach(dsId -> {
|
hostdby.forEach(dsId -> {
|
||||||
// datasources
|
datasources
|
||||||
// .addAll(
|
.addAll(
|
||||||
// conf.getCommunityForDatasource(dsId, param));
|
conf.getCommunityForDatasource(dsId, param));
|
||||||
if (conf.isEoscDatasource(dsId)) {
|
if (conf.isEoscDatasource(dsId)) {
|
||||||
datasources.add("eosc");
|
datasources.add("eosc");
|
||||||
}
|
}
|
||||||
|
@ -228,7 +226,6 @@ public class ResultTagger implements Serializable {
|
||||||
.forEach(communityId -> {
|
.forEach(communityId -> {
|
||||||
if (!removeCommunities.contains(communityId) &&
|
if (!removeCommunities.contains(communityId) &&
|
||||||
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||||
!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
|
||||||
conf
|
conf
|
||||||
.getSelectionConstraintsMap()
|
.getSelectionConstraintsMap()
|
||||||
.get(communityId)
|
.get(communityId)
|
||||||
|
|
|
@ -33,8 +33,6 @@ public class SelectionConstraints implements Serializable {
|
||||||
|
|
||||||
// Constraints in or
|
// Constraints in or
|
||||||
public boolean verifyCriteria(final Map<String, List<String>> param) {
|
public boolean verifyCriteria(final Map<String, List<String>> param) {
|
||||||
if (criteria.isEmpty())
|
|
||||||
return true;
|
|
||||||
for (Constraints selc : criteria) {
|
for (Constraints selc : criteria) {
|
||||||
if (selc.verifyCriteria(param)) {
|
if (selc.verifyCriteria(param)) {
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -14,7 +14,6 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.Row;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -85,12 +84,11 @@ public class SparkCountryPropagationJob {
|
||||||
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
|
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
|
||||||
|
|
||||||
log.info("Reading prepared info: {}", preparedInfoPath);
|
log.info("Reading prepared info: {}", preparedInfoPath);
|
||||||
final Dataset<Row> preparedInfoRaw = spark
|
Dataset<ResultCountrySet> prepared = spark
|
||||||
.read()
|
.read()
|
||||||
.json(preparedInfoPath);
|
.json(preparedInfoPath)
|
||||||
|
.as(Encoders.bean(ResultCountrySet.class));
|
||||||
|
|
||||||
if (!preparedInfoRaw.isEmpty()) {
|
|
||||||
final Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
|
|
||||||
res
|
res
|
||||||
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
|
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
|
||||||
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
|
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
|
||||||
|
@ -98,13 +96,7 @@ public class SparkCountryPropagationJob {
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
} else {
|
|
||||||
res
|
|
||||||
.write()
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.json(outputPath);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue