Compare commits
61 Commits
main
...
affiliatio
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | 420f43fc2f | |
Miriam Baglioni | 595883fef0 | |
Miriam Baglioni | f8988af98d | |
Giambattista Bloisi | 56b05cde0b | |
Claudio Atzori | 62ff843334 | |
Claudio Atzori | d5867a1992 | |
Claudio Atzori | e5df68772d | |
Miriam Baglioni | 7e6d12fa77 | |
Miriam Baglioni | 191fc3a461 | |
Claudio Atzori | 10696f2a44 | |
Claudio Atzori | 5734b80861 | |
Antonis Lempesis | f3c179658a | |
Miriam Baglioni | b18ad035c1 | |
Miriam Baglioni | e430826e00 | |
Giambattista Bloisi | c45cae447a | |
Claudio Atzori | 3fcafc7ed6 | |
Miriam Baglioni | 599e56dbc6 | |
Claudio Atzori | 6397141e56 | |
Claudio Atzori | e354f9853a | |
Claudio Atzori | 535a7b99f1 | |
Sandro La Bruzzo | 6a097abc89 | |
Michele Artini | 9754521847 | |
Michele Artini | 54f8b4da39 | |
Miriam Baglioni | 4d3e079590 | |
Michele Artini | e941adbe2b | |
Michele Artini | fdbe629f49 | |
Antonis Lempesis | 619aa34a15 | |
Antonis Lempesis | dbea7a4072 | |
Antonis Lempesis | c9241dba0d | |
Michele Artini | 755a5aefcf | |
Michele Artini | db6f137cf9 | |
Serafeim Chatzopoulos | 50401a872f | |
Antonis Lempesis | 37ad259296 | |
Antonis Lempesis | b64c144abf | |
Serafeim Chatzopoulos | 37c04cbad7 | |
Miriam Baglioni | 468f2aa5a5 | |
Miriam Baglioni | 89fcf4086c | |
Miriam Baglioni | 8c185a7b1a | |
Miriam Baglioni | 985ca15264 | |
Antonis Lempesis | d0590e0e49 | |
Antonis Lempesis | 7d2c0a3723 | |
Lampros Smyrnaios | e9686365a2 | |
Lampros Smyrnaios | ce0aee21cc | |
Lampros Smyrnaios | 7b7dd32ad5 | |
Lampros Smyrnaios | 7ce051d766 | |
Lampros Smyrnaios | aa4d7d5e20 | |
Lampros Smyrnaios | 54e11b6a43 | |
Lampros Smyrnaios | fe2275a9b0 | |
Lampros Smyrnaios | a644a6f4fe | |
Lampros Smyrnaios | 888637773c | |
Lampros Smyrnaios | e0ac494859 | |
Lampros Smyrnaios | 3c17183d10 | |
Lampros Smyrnaios | 69a9ac7393 | |
Lampros Smyrnaios | 342223f75c | |
Lampros Smyrnaios | 2616971e2b | |
Lampros Smyrnaios | ba533d9f34 | |
Lampros Smyrnaios | d46b78b659 | |
Lampros Smyrnaios | 6f2ebb2a52 | |
Lampros Smyrnaios | ca091c0f1e | |
Lampros Smyrnaios | 0b897f2f66 | |
Lampros Smyrnaios | db33f7727c |
|
@ -7,12 +7,12 @@ import java.sql.*;
|
|||
import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class DbClient implements Closeable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DbClient.class);
|
||||
private static final Log log = LogFactory.getLog(DbClient.class);
|
||||
|
||||
private final Connection connection;
|
||||
|
||||
|
@ -37,8 +37,6 @@ public class DbClient implements Closeable {
|
|||
try (final Statement stmt = connection.createStatement()) {
|
||||
stmt.setFetchSize(100);
|
||||
|
||||
log.info("running SQL:\n\n{}\n\n", sql);
|
||||
|
||||
try (final ResultSet rs = stmt.executeQuery(sql)) {
|
||||
while (rs.next()) {
|
||||
consumer.accept(rs);
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import okhttp3.MediaType;
|
||||
import okhttp3.RequestBody;
|
||||
import okhttp3.internal.Util;
|
||||
import okio.BufferedSink;
|
||||
import okio.Okio;
|
||||
import okio.Source;
|
||||
|
||||
public class InputStreamRequestBody extends RequestBody {
|
||||
|
||||
private final InputStream inputStream;
|
||||
private final MediaType mediaType;
|
||||
private final long lenght;
|
||||
|
||||
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
|
||||
|
||||
return new InputStreamRequestBody(inputStream, mediaType, len);
|
||||
}
|
||||
|
||||
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
|
||||
this.inputStream = inputStream;
|
||||
this.mediaType = mediaType;
|
||||
this.lenght = len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MediaType contentType() {
|
||||
return mediaType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long contentLength() {
|
||||
|
||||
return lenght;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(BufferedSink sink) throws IOException {
|
||||
Source source = null;
|
||||
try {
|
||||
source = Okio.source(inputStream);
|
||||
sink.writeAll(source);
|
||||
} finally {
|
||||
Util.closeQuietly(source);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
/**
 * Signals that the requested concept record id was not found among the depositions.
 *
 * It is a checked {@link Exception} (rather than a bare {@code Throwable}) so that it follows the standard
 * exception hierarchy; it remains a {@code Throwable}, so existing {@code throws} and {@code catch} clauses
 * naming this type keep compiling.
 */
public class MissingConceptDoiException extends Exception {

    /**
     * @param message description of the missing concept record id
     */
    public MissingConceptDoiException(String message) {
        super(message);
    }
}
|
|
@ -0,0 +1,363 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.IOException;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.entity.ContentType;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
|
||||
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
|
||||
import okhttp3.*;
|
||||
|
||||
public class ZenodoAPIClient implements Serializable {
|
||||
|
||||
String urlString;
|
||||
String bucket;
|
||||
|
||||
String deposition_id;
|
||||
String access_token;
|
||||
|
||||
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
|
||||
|
||||
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
|
||||
|
||||
public String getUrlString() {
|
||||
return urlString;
|
||||
}
|
||||
|
||||
public void setUrlString(String urlString) {
|
||||
this.urlString = urlString;
|
||||
}
|
||||
|
||||
public String getBucket() {
|
||||
return bucket;
|
||||
}
|
||||
|
||||
public void setBucket(String bucket) {
|
||||
this.bucket = bucket;
|
||||
}
|
||||
|
||||
public void setDeposition_id(String deposition_id) {
|
||||
this.deposition_id = deposition_id;
|
||||
}
|
||||
|
||||
public ZenodoAPIClient(String urlString, String access_token) {
|
||||
|
||||
this.urlString = urlString;
|
||||
this.access_token = access_token;
|
||||
}
|
||||
|
||||
/**
|
||||
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
|
||||
*
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
*/
|
||||
public int newDeposition() throws IOException {
|
||||
String json = "{}";
|
||||
|
||||
URL url = new URL(urlString);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setRequestMethod("POST");
|
||||
conn.setDoOutput(true);
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = json.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
}
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
conn.disconnect();
|
||||
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
|
||||
this.bucket = newSubmission.getLinks().getBucket();
|
||||
this.deposition_id = newSubmission.getId();
|
||||
|
||||
return responseCode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload files in Zenodo.
|
||||
*
|
||||
* @param is the inputStream for the file to upload
|
||||
* @param file_name the name of the file as it will appear on Zenodo
|
||||
* @return the response code
|
||||
*/
|
||||
public int uploadIS(InputStream is, String file_name) throws IOException {
|
||||
|
||||
URL url = new URL(bucket + "/" + file_name);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("PUT");
|
||||
|
||||
byte[] buf = new byte[8192];
|
||||
int length;
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
while ((length = is.read(buf)) != -1) {
|
||||
os.write(buf, 0, length);
|
||||
}
|
||||
|
||||
}
|
||||
int responseCode = conn.getResponseCode();
|
||||
if (!checkOKStatus(responseCode)) {
|
||||
throw new IOException("Unexpected code " + responseCode + getBody(conn));
|
||||
}
|
||||
|
||||
return responseCode;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private String getBody(HttpURLConnection conn) throws IOException {
|
||||
String body = "{}";
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
|
||||
StringBuilder response = new StringBuilder();
|
||||
String responseLine = null;
|
||||
while ((responseLine = br.readLine()) != null) {
|
||||
response.append(responseLine.trim());
|
||||
}
|
||||
|
||||
body = response.toString();
|
||||
|
||||
}
|
||||
return body;
|
||||
}
|
||||
|
||||
/**
|
||||
* Associates metadata information to the current deposition
|
||||
*
|
||||
* @param metadata the metadata
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
*/
|
||||
public int sendMretadata(String metadata) throws IOException {
|
||||
|
||||
URL url = new URL(urlString + "/" + deposition_id);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("PUT");
|
||||
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = metadata.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
|
||||
}
|
||||
|
||||
final int responseCode = conn.getResponseCode();
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + getBody(conn));
|
||||
|
||||
return responseCode;
|
||||
|
||||
}
|
||||
|
||||
private boolean checkOKStatus(int responseCode) {
|
||||
|
||||
if (HttpURLConnection.HTTP_OK != responseCode ||
|
||||
HttpURLConnection.HTTP_CREATED != responseCode)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* To publish the current deposition. It works for both new deposition or new version of an old deposition
|
||||
*
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
*/
|
||||
@Deprecated
|
||||
public int publish() throws IOException {
|
||||
|
||||
String json = "{}";
|
||||
|
||||
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
|
||||
|
||||
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
|
||||
|
||||
Request request = new Request.Builder()
|
||||
.url(urlString + "/" + deposition_id + "/actions/publish")
|
||||
.addHeader("Authorization", "Bearer " + access_token)
|
||||
.post(body)
|
||||
.build();
|
||||
|
||||
try (Response response = httpClient.newCall(request).execute()) {
|
||||
|
||||
if (!response.isSuccessful())
|
||||
throw new IOException("Unexpected code " + response + response.body().string());
|
||||
|
||||
return response.code();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
|
||||
* for the new version.
|
||||
*
|
||||
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
|
||||
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
||||
* concept_rec_id = 656930
|
||||
* @return response code
|
||||
*/
|
||||
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
||||
setDepositionId(concept_rec_id, 1);
|
||||
String json = "{}";
|
||||
|
||||
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("POST");
|
||||
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = json.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
|
||||
}
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||
String latest_draft = zenodoModel.getLinks().getLatest_draft();
|
||||
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
|
||||
bucket = getBucket(latest_draft);
|
||||
|
||||
return responseCode;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* To finish uploading a version or new deposition not published
|
||||
* It sets the deposition_id and the bucket to be used
|
||||
*
|
||||
*
|
||||
* @param deposition_id the deposition id of the not yet published upload
|
||||
* concept_rec_id = 656930
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
* @throws MissingConceptDoiException
|
||||
*/
|
||||
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
|
||||
|
||||
this.deposition_id = deposition_id;
|
||||
|
||||
String json = "{}";
|
||||
|
||||
URL url = new URL(urlString + "/" + deposition_id);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setRequestMethod("POST");
|
||||
conn.setDoOutput(true);
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = json.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
}
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
conn.disconnect();
|
||||
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||
bucket = zenodoModel.getLinks().getBucket();
|
||||
|
||||
return responseCode;
|
||||
|
||||
}
|
||||
|
||||
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
|
||||
|
||||
ZenodoModelList zenodoModelList = new Gson()
|
||||
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
|
||||
|
||||
for (ZenodoModel zm : zenodoModelList) {
|
||||
if (zm.getConceptrecid().equals(concept_rec_id)) {
|
||||
deposition_id = zm.getId();
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (zenodoModelList.size() == 0)
|
||||
throw new MissingConceptDoiException(
|
||||
"The concept record id specified was missing in the list of depositions");
|
||||
setDepositionId(concept_rec_id, page + 1);
|
||||
|
||||
}
|
||||
|
||||
private String getPrevDepositions(String page) throws IOException {
|
||||
|
||||
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
|
||||
urlBuilder.addQueryParameter("page", page);
|
||||
|
||||
URL url = new URL(urlBuilder.build().toString());
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("GET");
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
return body;
|
||||
|
||||
}
|
||||
|
||||
private String getBucket(String inputUurl) throws IOException {
|
||||
|
||||
URL url = new URL(inputUurl);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("GET");
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||
|
||||
return zenodoModel.getLinks().getBucket();
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
/**
 * Community entry of a deposition metadata record, identified by a single identifier.
 *
 * It is {@link java.io.Serializable} like the other model classes of this package, so that a serializable
 * object holding a list of communities can actually be serialized.
 */
public class Community implements java.io.Serializable {

    private String identifier;

    /**
     * @return the community identifier, or null if not set
     */
    public String getIdentifier() {
        return identifier;
    }

    /**
     * @param identifier the community identifier
     */
    public void setIdentifier(String identifier) {
        this.identifier = identifier;
    }
}
|
|
@ -0,0 +1,47 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
/**
 * Creator entry of a deposition metadata record: name, affiliation and ORCID.
 *
 * It is {@link java.io.Serializable} like the other model classes of this package, so that a serializable
 * object holding a list of creators can actually be serialized.
 */
public class Creator implements java.io.Serializable {

    private String affiliation;
    private String name;
    private String orcid;

    public String getAffiliation() {
        return affiliation;
    }

    public void setAffiliation(String affiliation) {
        this.affiliation = affiliation;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getOrcid() {
        return orcid;
    }

    public void setOrcid(String orcid) {
        this.orcid = orcid;
    }

    /**
     * Builds a creator from the given values; any of them may be null, in which case the corresponding
     * property stays null.
     *
     * @param name        the creator name
     * @param affiliation the creator affiliation
     * @param orcid       the creator ORCID
     * @return the new creator
     */
    public static Creator newInstance(String name, String affiliation, String orcid) {
        final Creator c = new Creator();
        // fields of a fresh instance are null already, so no null checks are needed
        c.name = name;
        c.affiliation = affiliation;
        c.orcid = orcid;
        return c;
    }
}
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean describing a file entry of a deposition: checksum, file name, size and id.
 */
public class File implements Serializable {

    private String checksum;
    private String filename;
    private long filesize;
    private String id;

    /** @return the checksum of the file */
    public String getChecksum() {
        return this.checksum;
    }

    /** @param checksum the checksum of the file */
    public void setChecksum(final String checksum) {
        this.checksum = checksum;
    }

    /** @return the name of the file */
    public String getFilename() {
        return this.filename;
    }

    /** @param filename the name of the file */
    public void setFilename(final String filename) {
        this.filename = filename;
    }

    /** @return the size of the file */
    public long getFilesize() {
        return this.filesize;
    }

    /** @param filesize the size of the file */
    public void setFilesize(final long filesize) {
        this.filesize = filesize;
    }

    /** @return the id of the file */
    public String getId() {
        return this.id;
    }

    /** @param id the id of the file */
    public void setId(final String id) {
        this.id = id;
    }
}
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean holding the id of a grant.
 */
public class Grant implements Serializable {

    private String id;

    /**
     * Factory creating a grant with the given id.
     *
     * @param id the grant id
     * @return the new grant
     */
    public static Grant newInstance(String id) {
        final Grant grant = new Grant();
        grant.setId(id);
        return grant;
    }

    /** @return the grant id */
    public String getId() {
        return this.id;
    }

    /** @param id the grant id */
    public void setId(final String id) {
        this.id = id;
    }
}
|
|
@ -0,0 +1,92 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean collecting the links attached to a deposition. Each property holds the value of the link
 * with the same name.
 */
public class Links implements Serializable {

    private String bucket;
    private String discard;
    private String edit;
    private String files;
    private String html;
    private String latest_draft;
    private String latest_draft_html;
    private String publish;
    private String self;

    /** @return the {@code bucket} link */
    public String getBucket() {
        return this.bucket;
    }

    /** @param bucket the {@code bucket} link */
    public void setBucket(final String bucket) {
        this.bucket = bucket;
    }

    /** @return the {@code discard} link */
    public String getDiscard() {
        return this.discard;
    }

    /** @param discard the {@code discard} link */
    public void setDiscard(final String discard) {
        this.discard = discard;
    }

    /** @return the {@code edit} link */
    public String getEdit() {
        return this.edit;
    }

    /** @param edit the {@code edit} link */
    public void setEdit(final String edit) {
        this.edit = edit;
    }

    /** @return the {@code files} link */
    public String getFiles() {
        return this.files;
    }

    /** @param files the {@code files} link */
    public void setFiles(final String files) {
        this.files = files;
    }

    /** @return the {@code html} link */
    public String getHtml() {
        return this.html;
    }

    /** @param html the {@code html} link */
    public void setHtml(final String html) {
        this.html = html;
    }

    /** @return the {@code latest_draft} link */
    public String getLatest_draft() {
        return this.latest_draft;
    }

    /** @param latest_draft the {@code latest_draft} link */
    public void setLatest_draft(final String latest_draft) {
        this.latest_draft = latest_draft;
    }

    /** @return the {@code latest_draft_html} link */
    public String getLatest_draft_html() {
        return this.latest_draft_html;
    }

    /** @param latest_draft_html the {@code latest_draft_html} link */
    public void setLatest_draft_html(final String latest_draft_html) {
        this.latest_draft_html = latest_draft_html;
    }

    /** @return the {@code publish} link */
    public String getPublish() {
        return this.publish;
    }

    /** @param publish the {@code publish} link */
    public void setPublish(final String publish) {
        this.publish = publish;
    }

    /** @return the {@code self} link */
    public String getSelf() {
        return this.self;
    }

    /** @param self the {@code self} link */
    public void setSelf(final String self) {
        this.self = self;
    }
}
|
|
@ -0,0 +1,153 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Metadata implements Serializable {
|
||||
|
||||
private String access_right;
|
||||
private List<Community> communities;
|
||||
private List<Creator> creators;
|
||||
private String description;
|
||||
private String doi;
|
||||
private List<Grant> grants;
|
||||
private List<String> keywords;
|
||||
private String language;
|
||||
private String license;
|
||||
private PrereserveDoi prereserve_doi;
|
||||
private String publication_date;
|
||||
private List<String> references;
|
||||
private List<RelatedIdentifier> related_identifiers;
|
||||
private String title;
|
||||
private String upload_type;
|
||||
private String version;
|
||||
|
||||
public String getUpload_type() {
|
||||
return upload_type;
|
||||
}
|
||||
|
||||
public void setUpload_type(String upload_type) {
|
||||
this.upload_type = upload_type;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public void setVersion(String version) {
|
||||
this.version = version;
|
||||
}
|
||||
|
||||
public String getAccess_right() {
|
||||
return access_right;
|
||||
}
|
||||
|
||||
public void setAccess_right(String access_right) {
|
||||
this.access_right = access_right;
|
||||
}
|
||||
|
||||
public List<Community> getCommunities() {
|
||||
return communities;
|
||||
}
|
||||
|
||||
public void setCommunities(List<Community> communities) {
|
||||
this.communities = communities;
|
||||
}
|
||||
|
||||
public List<Creator> getCreators() {
|
||||
return creators;
|
||||
}
|
||||
|
||||
public void setCreators(List<Creator> creators) {
|
||||
this.creators = creators;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public List<Grant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public void setGrants(List<Grant> grants) {
|
||||
this.grants = grants;
|
||||
}
|
||||
|
||||
public List<String> getKeywords() {
|
||||
return keywords;
|
||||
}
|
||||
|
||||
public void setKeywords(List<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(String license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
public PrereserveDoi getPrereserve_doi() {
|
||||
return prereserve_doi;
|
||||
}
|
||||
|
||||
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
|
||||
this.prereserve_doi = prereserve_doi;
|
||||
}
|
||||
|
||||
public String getPublication_date() {
|
||||
return publication_date;
|
||||
}
|
||||
|
||||
public void setPublication_date(String publication_date) {
|
||||
this.publication_date = publication_date;
|
||||
}
|
||||
|
||||
public List<String> getReferences() {
|
||||
return references;
|
||||
}
|
||||
|
||||
public void setReferences(List<String> references) {
|
||||
this.references = references;
|
||||
}
|
||||
|
||||
public List<RelatedIdentifier> getRelated_identifiers() {
|
||||
return related_identifiers;
|
||||
}
|
||||
|
||||
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
|
||||
this.related_identifiers = related_identifiers;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean pairing a doi with a record id.
 */
public class PrereserveDoi implements Serializable {

    private String doi;
    private String recid;

    /** @return the doi */
    public String getDoi() {
        return this.doi;
    }

    /** @param doi the doi */
    public void setDoi(final String doi) {
        this.doi = doi;
    }

    /** @return the record id */
    public String getRecid() {
        return this.recid;
    }

    /** @param recid the record id */
    public void setRecid(final String recid) {
        this.recid = recid;
    }
}
|
|
@ -0,0 +1,43 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean describing a related identifier: the identifier itself, its relation, resource type and
 * scheme.
 */
public class RelatedIdentifier implements Serializable {

    private String identifier;
    private String relation;
    private String resource_type;
    private String scheme;

    /** @return the identifier */
    public String getIdentifier() {
        return this.identifier;
    }

    /** @param identifier the identifier */
    public void setIdentifier(final String identifier) {
        this.identifier = identifier;
    }

    /** @return the relation */
    public String getRelation() {
        return this.relation;
    }

    /** @param relation the relation */
    public void setRelation(final String relation) {
        this.relation = relation;
    }

    /** @return the resource type */
    public String getResource_type() {
        return this.resource_type;
    }

    /** @param resource_type the resource type */
    public void setResource_type(final String resource_type) {
        this.resource_type = resource_type;
    }

    /** @return the scheme */
    public String getScheme() {
        return this.scheme;
    }

    /** @param scheme the scheme */
    public void setScheme(final String scheme) {
        this.scheme = scheme;
    }
}
|
|
@ -0,0 +1,118 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class ZenodoModel implements Serializable {
|
||||
|
||||
private String conceptrecid;
|
||||
private String created;
|
||||
|
||||
private List<File> files;
|
||||
private String id;
|
||||
private Links links;
|
||||
private Metadata metadata;
|
||||
private String modified;
|
||||
private String owner;
|
||||
private String record_id;
|
||||
private String state;
|
||||
private boolean submitted;
|
||||
private String title;
|
||||
|
||||
public String getConceptrecid() {
|
||||
return conceptrecid;
|
||||
}
|
||||
|
||||
public void setConceptrecid(String conceptrecid) {
|
||||
this.conceptrecid = conceptrecid;
|
||||
}
|
||||
|
||||
public String getCreated() {
|
||||
return created;
|
||||
}
|
||||
|
||||
public void setCreated(String created) {
|
||||
this.created = created;
|
||||
}
|
||||
|
||||
public List<File> getFiles() {
|
||||
return files;
|
||||
}
|
||||
|
||||
public void setFiles(List<File> files) {
|
||||
this.files = files;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public Links getLinks() {
|
||||
return links;
|
||||
}
|
||||
|
||||
public void setLinks(Links links) {
|
||||
this.links = links;
|
||||
}
|
||||
|
||||
public Metadata getMetadata() {
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public void setMetadata(Metadata metadata) {
|
||||
this.metadata = metadata;
|
||||
}
|
||||
|
||||
public String getModified() {
|
||||
return modified;
|
||||
}
|
||||
|
||||
public void setModified(String modified) {
|
||||
this.modified = modified;
|
||||
}
|
||||
|
||||
public String getOwner() {
|
||||
return owner;
|
||||
}
|
||||
|
||||
public void setOwner(String owner) {
|
||||
this.owner = owner;
|
||||
}
|
||||
|
||||
public String getRecord_id() {
|
||||
return record_id;
|
||||
}
|
||||
|
||||
public void setRecord_id(String record_id) {
|
||||
this.record_id = record_id;
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public void setState(String state) {
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public boolean isSubmitted() {
|
||||
return submitted;
|
||||
}
|
||||
|
||||
public void setSubmitted(boolean submitted) {
|
||||
this.submitted = submitted;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class ZenodoModelList extends ArrayList<ZenodoModel> {
|
||||
}
|
|
@ -65,7 +65,13 @@ public class RunSQLSparkJob {
|
|||
for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
|
||||
log.info("executing: {}", statement);
|
||||
long startTime = System.currentTimeMillis();
|
||||
try {
|
||||
spark.sql(statement).show();
|
||||
} catch (Exception e) {
|
||||
log.error("Error executing statement: {}", statement, e);
|
||||
System.err.println("Error executing statement: " + statement + "\n" + e);
|
||||
throw e;
|
||||
}
|
||||
log
|
||||
.info(
|
||||
"executed in {}",
|
||||
|
|
|
@ -119,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.getContext()
|
||||
.stream()
|
||||
.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
|
||||
.collect(Collectors.toCollection(ArrayList::new)));
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
return (T) res;
|
||||
} else {
|
||||
|
@ -1015,41 +1015,4 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.orElse(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies source-specific patches, in place, that should be removed as soon as the data is fixed upstream.
* Currently a single rule is implemented, and only for {@code Result} values: every instance hosted by the
* datasource {@code 10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016} whose pid list is non-null and contains
* no value starting with {@code 10.6092/unibo/amsacta} gets its hostedby replaced by {@code UNKNOWN_REPOSITORY}.
* Instances with a null pid list, and values that are not {@code Result}s, are left untouched.
|
||||
*
|
||||
* @param value the object to patch; it is modified in place
|
||||
* @return the same {@code value} instance that was passed in
|
||||
* @param <T> the concrete {@code Oaf} subtype of {@code value}
|
||||
*/
|
||||
public static <T extends Oaf> T dedicatedUglyHacks(T value) {
|
||||
if (value instanceof OafEntity) {
|
||||
if (value instanceof Result) {
|
||||
final Result r = (Result) value;
|
||||
|
||||
// Fix for AMS Acta: first select the instances hosted by the AMS Acta datasource id,
// then reset hostedby on those whose pids do not carry the AMS Acta DOI prefix.
|
||||
Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.filter(
|
||||
i -> Optional
|
||||
.ofNullable(i.getHostedby())
|
||||
.map(KeyValue::getKey)
|
||||
.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
|
||||
.orElse(false)))
|
||||
.ifPresent(instance -> instance.forEach(i -> {
|
||||
if (Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
|
||||
.orElse(false)) {
|
||||
i.setHostedby(UNKNOWN_REPOSITORY);
|
||||
}
|
||||
}));
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -433,10 +433,7 @@ public class MergeUtils {
|
|||
|
||||
// merge datainfo for same context id
|
||||
merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
|
||||
ArrayList<DataInfo> di = new ArrayList<>();
|
||||
di.addAll(r.getDataInfo());
|
||||
di.addAll(l.getDataInfo());
|
||||
r.setDataInfo(di);
|
||||
r.getDataInfo().addAll(l.getDataInfo());
|
||||
return r;
|
||||
}));
|
||||
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Disabled
|
||||
class ZenodoAPIClientTest {
|
||||
|
||||
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
|
||||
private final String ACCESS_TOKEN = "";
|
||||
|
||||
private final String CONCEPT_REC_ID = "657113";
|
||||
|
||||
private final String depositionId = "674915";
|
||||
|
||||
@Test
|
||||
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
|
||||
|
||||
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
|
||||
|
||||
Assertions.assertEquals(200, client.sendMretadata(metadata));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNewDeposition() throws IOException {
|
||||
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
Assertions.assertEquals(201, client.newDeposition());
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
|
||||
|
||||
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
|
||||
|
||||
Assertions.assertEquals(200, client.sendMretadata(metadata));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
|
||||
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
|
||||
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
|
||||
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
|
||||
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -177,7 +177,7 @@ class OafMapperUtilsTest {
|
|||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
|
||||
assertEquals(
|
||||
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
|
||||
ModelConstants.DATASET_RESULTTYPE_CLASSID,
|
||||
((Result) MergeUtils
|
||||
.merge(p2, d1))
|
||||
.getResulttype()
|
||||
|
|
|
@ -6,7 +6,18 @@
|
|||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
<properties>
|
||||
<affro.release.version>1.0.0</affro.release.version>
|
||||
</properties>
|
||||
|
||||
<scm>
|
||||
<url>https://code-repo.d4science.org/mkallipo/affRo</url>
|
||||
<connection>scm:git:https://code-repo.d4science.org/mkallipo/affRo.git</connection>
|
||||
</scm>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
|
@ -43,6 +54,32 @@
|
|||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-scm-plugin</artifactId>
|
||||
<version>1.8.1</version>
|
||||
<configuration>
|
||||
<connectionType>connection</connectionType>
|
||||
<!--
|
||||
<scmVersionType>tag</scmVersionType>--><!-- 'branch' can also be provided here -->
|
||||
<!-- <scmVersion>${affro.release.version}</scmVersion>--><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
|
||||
|
||||
<scmVersionType>branch</scmVersionType><!-- 'tag' can also be provided here -->
|
||||
<scmVersion>openaire-workflow-ready</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
|
||||
<checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/affRo</checkoutDirectory>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>checkout-affro</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>checkout</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
|
|
@ -46,6 +46,9 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String backupPath = parser.get("backupPath");
|
||||
log.info("backupPath {}", backupPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
|
@ -53,11 +56,11 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
|
||||
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
|
||||
|
||||
ocr.doExtract(inputPath, outputPath, fileSystem);
|
||||
ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
|
||||
|
||||
}
|
||||
|
||||
private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
|
||||
private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
|
||||
throws IOException {
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
|
@ -89,6 +92,7 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
}
|
||||
|
||||
}
|
||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -49,9 +49,6 @@ public class ReadCOCI implements Serializable {
|
|||
final String workingPath = parser.get("inputPath");
|
||||
log.info("workingPath {}", workingPath);
|
||||
|
||||
final String backupPath = parser.get("backupPath");
|
||||
log.info("backupPath {}", backupPath);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
|
@ -71,14 +68,12 @@ public class ReadCOCI implements Serializable {
|
|||
workingPath,
|
||||
fileSystem,
|
||||
outputPath,
|
||||
backupPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
|
||||
String outputPath,
|
||||
String backupPath,
|
||||
String delimiter) throws IOException {
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
|
@ -113,7 +108,7 @@ public class ReadCOCI implements Serializable {
|
|||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
|
||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
||||
fileSystem.delete(fileStatus.getPath());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -297,7 +297,7 @@ public class ExtractPerson implements Serializable {
|
|||
}
|
||||
|
||||
private static Relation getAffiliationRelation(Employment row) {
|
||||
String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
|
||||
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid());
|
||||
String target = ROR_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
|
||||
List<KeyValue> properties = new ArrayList<>();
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties ---
|
||||
# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
|
||||
# dhp.hadoop.frontend.user.name=ilias.kanellos
|
||||
# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
|
||||
# dhp.hadoop.frontend.port.ssh=22
|
||||
# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
|
||||
# jobTracker=yarnRM
|
||||
# nameNode=hdfs://nameservice1
|
||||
# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
|
||||
# maven.executable=mvn
|
||||
|
||||
|
||||
# The above is given differently in an example I found online
|
||||
oozie.action.sharelib.for.spark=spark2
|
||||
oozieActionShareLibForSpark2=spark2
|
||||
spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
|
||||
spark2EventLogDir=/user/spark/spark2ApplicationHistory
|
||||
sparkSqlWarehouseDir=/user/hive/warehouse
|
||||
#hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
|
||||
# This MAY avoid the no library used error
|
||||
oozie.use.system.libpath=true
|
||||
# Some stuff copied from openaire's jobs
|
||||
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
|
||||
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
|
||||
|
||||
# The following is needed as a property of a workflow
|
||||
wfAppPath=${oozieTopWfApplicationPath}
|
||||
|
||||
resumeFrom=Crossref
|
||||
|
||||
#OpenAlex input/output
|
||||
#resultFolder=/tmp/affro-results/oalex
|
||||
#inputFolder=/user/zeppelin/affiliations/raw_aff_string/2024-08
|
||||
|
||||
#Crossref input/output
|
||||
resultFolder=/tmp/affro-results/crossref
|
||||
inputFolder=/data/doiboost/crossref/crossref_unpack
|
||||
|
||||
#
|
||||
#crossrefInputPath=/data/bip-affiliations/crossref-data.json
|
||||
#pubmedInputPath=/data/bip-affiliations/pubmed-data.json
|
||||
#openapcInputPath=/data/bip-affiliations/openapc-data.json
|
||||
#dataciteInputPath=/data/bip-affiliations/datacite-data.json
|
||||
#
|
||||
#outputPath=/tmp/crossref-affiliations-output-v5
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,176 @@
|
|||
<workflow-app name="AffroAffiliations" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resumeFrom"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<decision name="resumeFrom">
|
||||
<switch>
|
||||
<case to="run-affro-on-iisdata">${wf:conf('resumeFrom') eq 'IIS'}</case>
|
||||
<case to="run-affro-on-crossref">${wf:conf('resumeFrom') eq 'Crossref'}</case>
|
||||
<default to="run-affro-on-oalexstrings"/>
|
||||
</switch>
|
||||
</decision>
|
||||
<action name="run-affro-on-iisdata">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Affiliations inference (Affro)</name>
|
||||
<jar>update_records.py</jar>
|
||||
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=15G
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
|
||||
--conf spark.executorEnv.PYSPARK_PYTHON=python3
|
||||
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/affro_test_example.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
|
||||
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
|
||||
</spark-opts>
|
||||
|
||||
<arg>${resultFolder}</arg>
|
||||
|
||||
<file>${wfAppPath}/affRo/update_records.py#update_records.py</file>
|
||||
</spark>
|
||||
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
|
||||
</action>
|
||||
|
||||
<action name="run-affro-on-oalexstrings">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Affiliations inference (Affro)</name>
|
||||
<jar>strings.py</jar>
|
||||
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=15G
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
|
||||
--conf spark.executorEnv.PYSPARK_PYTHON=python3
|
||||
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
|
||||
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
|
||||
</spark-opts>
|
||||
|
||||
<arg>${inputFolder}</arg>
|
||||
<arg>${resultFolder}</arg>
|
||||
|
||||
<file>${wfAppPath}/affRo/strings.py#strings.py</file>
|
||||
</spark>
|
||||
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
|
||||
</action>
|
||||
|
||||
<action name="run-affro-on-crossref">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Affiliations inference (Affro)</name>
|
||||
<jar>crossref.py</jar>
|
||||
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=15G
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
|
||||
--conf spark.executorEnv.PYSPARK_PYTHON=python3
|
||||
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
|
||||
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
|
||||
</spark-opts>
|
||||
|
||||
<arg>${inputFolder}</arg>
|
||||
<arg>${resultFolder}</arg>
|
||||
|
||||
<file>${wfAppPath}/affRo/crossref.py#crossref.py</file>
|
||||
</spark>
|
||||
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -35,6 +35,5 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json
|
|||
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
|
||||
openapcInputPath=/data/bip-affiliations/openapc-data.json
|
||||
dataciteInputPath=/data/bip-affiliations/datacite-data.json
|
||||
webCrawlInputPath=/data/bip-affiliations/webCrawl/
|
||||
|
||||
outputPath=/tmp/crossref-affiliations-output-v5
|
||||
|
|
|
@ -21,10 +21,6 @@
|
|||
<name>webCrawlInputPath</name>
|
||||
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>publisherInputPath</name>
|
||||
<description>the path where to find the inferred affiliation relations from publisher websites</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
|
@ -121,7 +117,6 @@
|
|||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
||||
<arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -16,5 +16,11 @@
|
|||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bp",
|
||||
"paramLongName": "backupPath",
|
||||
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -30,12 +30,6 @@
|
|||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bp",
|
||||
"paramLongName": "backupPath",
|
||||
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
||||
|
|
|
@ -94,17 +94,7 @@
|
|||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/Original</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="extract_correspondence">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
|
||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -129,7 +119,6 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
|
||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
||||
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
</spark>
|
||||
|
|
|
@ -16,8 +16,7 @@
|
|||
"paramLongName": "isSparkSessionManged",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
},{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "nameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||
<default to="create_actionset"/> <!-- taken when resumeFrom is not 'DownloadDump': the download is skipped and the workflow starts from the actionset creation -->
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -33,14 +33,6 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="download"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="download">
|
||||
<shell xmlns="uri:oozie:shell-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
|
|
|
@ -8,40 +8,19 @@
|
|||
<name>database</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
<name>targetPath</name>
|
||||
<description>the Target Working dir path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
<start to="ConvertDB"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="ConvertDB"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
<action name="ConvertDB">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -62,48 +41,11 @@
|
|||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--database</arg><arg>${database}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
|
||||
</action>
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -2,5 +2,5 @@
|
|||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
|
||||
{"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
|
||||
]
|
|
@ -1,20 +1,5 @@
|
|||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source Path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mo",
|
||||
"paramLongName": "mdstoreOutputVersion",
|
||||
"paramDescription": "the oaf path ",
|
||||
"paramRequired": true
|
||||
}
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
|
||||
]
|
|
@ -9,26 +9,34 @@
|
|||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
<name>targetPath</name>
|
||||
<description>the OAF MDStore Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resumeFrom</name>
|
||||
<value>CreateEBIDataSet</value>
|
||||
<value>DownloadEBILinks</value>
|
||||
<description>node to start</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
|
||||
<case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
||||
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
||||
<default to="DownloadEBILinks"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -69,29 +77,9 @@
|
|||
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
|
||||
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
|
||||
</fs>
|
||||
<ok to="StartTransaction"/>
|
||||
<ok to="CreateEBIDataSet"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="CreateEBIDataSet"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CreateEBIDataSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
|
@ -107,49 +95,11 @@
|
|||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -407,10 +407,9 @@ object DataciteToOAFTransformation {
|
|||
)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(
|
||||
a.setRawAffiliationString(
|
||||
c.affiliation.get
|
||||
.filter(af => af.nonEmpty)
|
||||
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||
.asJava
|
||||
)
|
||||
a.setRank(idx + 1)
|
||||
|
|
|
@ -231,7 +231,7 @@ object BioDBToOAF {
|
|||
def uniprotToOAF(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pid = (json \ "pid").extract[String].trim()
|
||||
val pid = (json \ "pid").extract[String]
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
|
|
|
@ -2,15 +2,12 @@ package eu.dnetlib.dhp.sx.bio
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
|
||||
|
||||
object SparkTransformBioDatabaseToOAF {
|
||||
|
||||
|
@ -28,13 +25,8 @@ object SparkTransformBioDatabaseToOAF {
|
|||
|
||||
val dbPath: String = parser.get("dbPath")
|
||||
log.info("dbPath: {}", database)
|
||||
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
|
||||
|
||||
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info("outputBasePath: {}", outputBasePath)
|
||||
val targetPath: String = parser.get("targetPath")
|
||||
log.info("targetPath: {}", database)
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
|
@ -51,28 +43,24 @@ object SparkTransformBioDatabaseToOAF {
|
|||
case "UNIPROT" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
targetPath
|
||||
)
|
||||
case "PDB" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
targetPath
|
||||
)
|
||||
case "SCHOLIX" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
targetPath
|
||||
)
|
||||
case "CROSSREF_LINKS" =>
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
|
||||
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
|
||||
val mdStoreSize = df.count
|
||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,9 +9,6 @@ import org.apache.commons.io.IOUtils
|
|||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
|
||||
|
||||
object SparkEBILinksToOaf {
|
||||
|
||||
|
@ -35,13 +32,8 @@ object SparkEBILinksToOaf {
|
|||
import spark.implicits._
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
|
||||
|
||||
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info("outputBasePath: {}", outputBasePath)
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
|
||||
val ebLinks: Dataset[EBILinkItem] = spark.read
|
||||
|
@ -54,10 +46,7 @@ object SparkEBILinksToOaf {
|
|||
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
targetPath
|
||||
)
|
||||
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
|
||||
val mdStoreSize = df.count
|
||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,7 +28,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
|
@ -40,8 +39,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
private static Path workingDir;
|
||||
private static final String ID_PREFIX = "50|doi_________::";
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(PrepareAffiliationRelationsTest.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
|
|
|
@ -77,13 +77,13 @@ public class RemapTest {
|
|||
MapOCIdsInPids
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManged",
|
||||
"-isSparkSessionManged",
|
||||
Boolean.FALSE.toString(),
|
||||
"--inputPath",
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"--outputPath",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/out/",
|
||||
"--nameNode", "hdfs://localhost"
|
||||
"-nameNode", "input1;input2;input3;input4;input5"
|
||||
});
|
||||
|
||||
}
|
||||
|
|
|
@ -1,44 +1,15 @@
|
|||
{"classification": "Signaling protein", "pdb": "5NM4", "deposition_date": "2017-04-05", "title": "A2a adenosine receptor room-temperature structure determined by serial Femtosecond crystallography", "Keywords": ["Oom-temperature", " serial crystallography", " signaling protein"], "authors": ["T.weinert", "R.cheng", "D.james", "D.gashi", "P.nogly", "K.jaeger", "M.hennig", "", "J.standfuss"], "pmid": "28912485", "doi": "10.1038/S41467-017-00630-4"}
|
||||
{"classification": "Oxidoreductase/oxidoreductase inhibitor", "pdb": "4KN3", "deposition_date": "2013-05-08", "title": "Structure of the y34ns91g double mutant of dehaloperoxidase from Amphitrite ornata with 2,4,6-trichlorophenol", "Keywords": ["Lobin", " oxygen storage", " peroxidase", " oxidoreductase", " oxidoreductase-", "Oxidoreductase inhibitor complex"], "authors": ["C.wang", "L.lovelace", "L.lebioda"], "pmid": "23952341", "doi": "10.1021/BI400627W"}
|
||||
{"classification": "Transport protein", "pdb": "8HKM", "deposition_date": "2022-11-27", "title": "Ion channel", "Keywords": ["On channel", " transport protein"], "authors": ["D.h.jiang", "J.t.zhang"], "pmid": "37494189", "doi": "10.1016/J.CELREP.2023.112858"}
|
||||
{"classification": "Signaling protein", "pdb": "6JT1", "deposition_date": "2019-04-08", "title": "Structure of human soluble guanylate cyclase in the heme oxidised State", "Keywords": ["Oluble guanylate cyclase", " signaling protein"], "authors": ["L.chen", "Y.kang", "R.liu", "J.-x.wu"], "pmid": "31514202", "doi": "10.1038/S41586-019-1584-6"}
|
||||
{"classification": "Immune system", "pdb": "7OW6", "deposition_date": "2021-06-16", "title": "Crystal structure of a tcr in complex with hla-a*11:01 bound to kras G12d peptide (vvvgadgvgk)", "Keywords": ["La", " kras", " tcr", " immune system"], "authors": ["V.karuppiah", "R.a.robinson"], "doi": "10.1038/S41467-022-32811-1"}
|
||||
{"classification": "Biosynthetic protein", "pdb": "5EQ8", "deposition_date": "2015-11-12", "title": "Crystal structure of medicago truncatula histidinol-phosphate Phosphatase (mthpp) in complex with l-histidinol", "Keywords": ["Istidine biosynthesis", " metabolic pathways", " dimer", " plant", "", "Biosynthetic protein"], "authors": ["M.ruszkowski", "Z.dauter"], "pmid": "26994138", "doi": "10.1074/JBC.M115.708727"}
|
||||
{"classification": "De novo protein", "pdb": "8CWA", "deposition_date": "2022-05-18", "title": "Solution nmr structure of 8-residue rosetta-designed cyclic peptide D8.21 in cdcl3 with cis/trans switching (tc conformation, 53%)", "Keywords": ["Yclic peptide", " non natural amino acids", " cis/trans", " switch peptides", "", "De novo design", "Membrane permeability", "De novo protein"], "authors": ["T.a.ramelot", "R.tejero", "G.t.montelione"], "pmid": "36041435", "doi": "10.1016/J.CELL.2022.07.019"}
|
||||
{"classification": "Hydrolase", "pdb": "3R6M", "deposition_date": "2011-03-21", "title": "Crystal structure of vibrio parahaemolyticus yeaz", "Keywords": ["Ctin/hsp70 nucleotide-binding fold", " bacterial resuscitation", " viable", "But non-culturable state", "Resuscitation promoting factor", "Ygjd", "", "Yjee", "Vibrio parahaemolyticus", "Hydrolase"], "authors": ["A.roujeinikova", "I.aydin"], "pmid": "21858042", "doi": "10.1371/JOURNAL.PONE.0023245"}
|
||||
{"classification": "Hydrolase", "pdb": "2W5J", "deposition_date": "2008-12-10", "title": "Structure of the c14-rotor ring of the proton translocating Chloroplast atp synthase", "Keywords": ["Ydrolase", " chloroplast", " atp synthase", " lipid-binding", " cf(0)", " membrane", "", "Transport", "Formylation", "Energy transduction", "Hydrogen ion transport", "", "Ion transport", "Transmembrane", "Membrane protein"], "authors": ["M.vollmar", "D.schlieper", "M.winn", "C.buechner", "G.groth"], "pmid": "19423706", "doi": "10.1074/JBC.M109.006916"}
|
||||
{"classification": "De novo protein", "pdb": "4GLU", "deposition_date": "2012-08-14", "title": "Crystal structure of the mirror image form of vegf-a", "Keywords": ["-protein", " covalent dimer", " cysteine knot protein", " growth factor", " de", "Novo protein"], "authors": ["K.mandal", "M.uppalapati", "D.ault-riche", "J.kenney", "J.lowitz", "S.sidhu", "", "S.b.h.kent"], "pmid": "22927390", "doi": "10.1073/PNAS.1210483109"}
|
||||
{"classification": "Hydrolase/hydrolase inhibitor", "pdb": "3WYL", "deposition_date": "2014-09-01", "title": "Crystal structure of the catalytic domain of pde10a complexed with 5- Methoxy-3-(1-phenyl-1h-pyrazol-5-yl)-1-(3-(trifluoromethyl)phenyl) Pyridazin-4(1h)-one", "Keywords": ["Ydrolase-hydrolase inhibitor complex"], "authors": ["H.oki", "Y.hayano"], "pmid": "25384088", "doi": "10.1021/JM5013648"}
|
||||
{"classification": "Isomerase", "pdb": "5BOR", "deposition_date": "2015-05-27", "title": "Structure of acetobacter aceti pure-s57c, sulfonate form", "Keywords": ["Cidophile", " pure", " purine biosynthesis", " isomerase"], "authors": ["K.l.sullivan", "T.j.kappock"]}
|
||||
{"classification": "Hydrolase", "pdb": "1X0C", "deposition_date": "2005-03-17", "title": "Improved crystal structure of isopullulanase from aspergillus niger Atcc 9642", "Keywords": ["Ullulan", " glycoside hydrolase family 49", " glycoprotein", " hydrolase"], "authors": ["M.mizuno", "T.tonozuka", "A.yamamura", "Y.miyasaka", "H.akeboshi", "S.kamitori", "", "A.nishikawa", "Y.sakano"], "pmid": "18155243", "doi": "10.1016/J.JMB.2007.11.098"}
|
||||
{"classification": "Oxidoreductase", "pdb": "7CUP", "deposition_date": "2020-08-23", "title": "Structure of 2,5-dihydroxypridine dioxygenase from pseudomonas putida Kt2440", "Keywords": ["On-heme dioxygenase", " oxidoreductase"], "authors": ["G.q.liu", "H.z.tang"]}
|
||||
{"classification": "Ligase", "pdb": "1VCN", "deposition_date": "2004-03-10", "title": "Crystal structure of t.th. hb8 ctp synthetase complex with sulfate Anion", "Keywords": ["Etramer", " riken structural genomics/proteomics initiative", " rsgi", "", "Structural genomics", "Ligase"], "authors": ["M.goto", "Riken structural genomics/proteomics initiative (rsgi)"], "pmid": "15296735", "doi": "10.1016/J.STR.2004.05.013"}
|
||||
{"classification": "Transferase/transferase inhibitor", "pdb": "6C9V", "deposition_date": "2018-01-28", "title": "Mycobacterium tuberculosis adenosine kinase bound to (2r,3s,4r,5r)-2- (hydroxymethyl)-5-(6-(4-phenylpiperazin-1-yl)-9h-purin-9-yl) Tetrahydrofuran-3,4-diol", "Keywords": ["Ucleoside analog", " complex", " inhibitor", " structural genomics", " psi-2", "", "Protein structure initiative", "Tb structural genomics consortium", "", "Tbsgc", "Transferase-transferase inhibitor complex"], "authors": ["R.a.crespo", "Tb structural genomics consortium (tbsgc)"], "pmid": "31002508", "doi": "10.1021/ACS.JMEDCHEM.9B00020"}
|
||||
{"classification": "De novo protein", "pdb": "4LPY", "deposition_date": "2013-07-16", "title": "Crystal structure of tencon variant g10", "Keywords": ["Ibronectin type iii fold", " alternate scaffold", " de novo protein"], "authors": ["A.teplyakov", "G.obmolova", "G.l.gilliland"], "pmid": "24375666", "doi": "10.1002/PROT.24502"}
|
||||
{"classification": "Isomerase", "pdb": "2Y88", "deposition_date": "2011-02-03", "title": "Crystal structure of mycobacterium tuberculosis phosphoribosyl Isomerase (variant d11n) with bound prfar", "Keywords": ["Romatic amino acid biosynthesis", " isomerase", " tim-barrel", " histidine", "Biosynthesis", "Tryptophan biosynthesis"], "authors": ["J.kuper", "A.v.due", "A.geerlof", "M.wilmanns"], "pmid": "21321225", "doi": "10.1073/PNAS.1015996108"}
|
||||
{"classification": "Unknown function", "pdb": "1SR0", "deposition_date": "2004-03-22", "title": "Crystal structure of signalling protein from sheep(sps-40) at 3.0a Resolution using crystal grown in the presence of polysaccharides", "Keywords": ["Ignalling protein", " involution", " unknown function"], "authors": ["D.b.srivastava", "A.s.ethayathulla", "N.singh", "J.kumar", "S.sharma", "T.p.singh"]}
|
||||
{"classification": "Dna binding protein", "pdb": "3RH2", "deposition_date": "2011-04-11", "title": "Crystal structure of a tetr-like transcriptional regulator (sama_0099) From shewanella amazonensis sb2b at 2.42 a resolution", "Keywords": ["Na/rna-binding 3-helical bundle", " structural genomics", " joint center", "For structural genomics", "Jcsg", "Protein structure initiative", "Psi-", "Biology", "Dna binding protein"], "authors": ["Joint center for structural genomics (jcsg)"]}
|
||||
{"classification": "Transferase", "pdb": "2WK5", "deposition_date": "2009-06-05", "title": "Structural features of native human thymidine phosphorylase And in complex with 5-iodouracil", "Keywords": ["Lycosyltransferase", " developmental protein", " angiogenesis", "", "5-iodouracil", "Growth factor", "Enzyme kinetics", "", "Differentiation", "Disease mutation", "Thymidine", "Phosphorylase", "Chemotaxis", "Transferase", "Mutagenesis", "", "Polymorphism"], "authors": ["E.mitsiki", "A.c.papageorgiou", "S.iyer", "N.thiyagarajan", "S.h.prior", "", "D.sleep", "C.finnis", "K.r.acharya"], "pmid": "19555658", "doi": "10.1016/J.BBRC.2009.06.104"}
|
||||
{"classification": "Hydrolase", "pdb": "3P9Y", "deposition_date": "2010-10-18", "title": "Crystal structure of the drosophila melanogaster ssu72-pctd complex", "Keywords": ["Hosphatase", " cis proline", " lmw ptp-like fold", " rna polymerase ii ctd", "", "Hydrolase"], "authors": ["J.w.werner-allen", "P.zhou"], "pmid": "21159777", "doi": "10.1074/JBC.M110.197129"}
|
||||
{"classification": "Recombination/dna", "pdb": "6OEO", "deposition_date": "2019-03-27", "title": "Cryo-em structure of mouse rag1/2 nfc complex (dna1)", "Keywords": ["(d)j recombination", " dna transposition", " rag", " scid", " recombination", "", "Recombination-dna complex"], "authors": ["X.chen", "Y.cui", "Z.h.zhou", "W.yang", "M.gellert"], "pmid": "32015552", "doi": "10.1038/S41594-019-0363-2"}
|
||||
{"classification": "Hydrolase", "pdb": "4ECA", "deposition_date": "1997-02-21", "title": "Asparaginase from e. coli, mutant t89v with covalently bound aspartate", "Keywords": ["Ydrolase", " acyl-enzyme intermediate", " threonine amidohydrolase"], "authors": ["G.j.palm", "J.lubkowski", "A.wlodawer"], "pmid": "8706862", "doi": "10.1016/0014-5793(96)00660-6"}
|
||||
{"classification": "Transcription/protein binding", "pdb": "3UVX", "deposition_date": "2011-11-30", "title": "Crystal structure of the first bromodomain of human brd4 in complex With a diacetylated histone 4 peptide (h4k12ack16ac)", "Keywords": ["Romodomain", " bromodomain containing protein 4", " cap", " hunk1", " mcap", "", "Mitotic chromosome associated protein", "Peptide complex", "Structural", "Genomics consortium", "Sgc", "Transcription-protein binding complex"], "authors": ["P.filippakopoulos", "S.picaud", "T.keates", "E.ugochukwu", "F.von delft", "", "C.h.arrowsmith", "A.m.edwards", "J.weigelt", "C.bountra", "S.knapp", "Structural", "Genomics consortium (sgc)"], "pmid": "22464331", "doi": "10.1016/J.CELL.2012.02.013"}
|
||||
{"classification": "Membrane protein", "pdb": "1TLZ", "deposition_date": "2004-06-10", "title": "Tsx structure complexed with uridine", "Keywords": ["Ucleoside transporter", " beta barrel", " uridine", " membrane", "Protein"], "authors": ["J.ye", "B.van den berg"], "pmid": "15272310", "doi": "10.1038/SJ.EMBOJ.7600330"}
|
||||
{"classification": "Dna binding protein", "pdb": "7AZD", "deposition_date": "2020-11-16", "title": "Dna polymerase sliding clamp from escherichia coli with peptide 20 Bound", "Keywords": ["Ntibacterial drug", " dna binding protein"], "authors": ["C.monsarrat", "G.compain", "C.andre", "I.martiel", "S.engilberge", "V.olieric", "", "P.wolff", "K.brillet", "M.landolfo", "C.silva da veiga", "J.wagner", "G.guichard", "", "D.y.burnouf"], "pmid": "34806883", "doi": "10.1021/ACS.JMEDCHEM.1C00918"}
|
||||
{"classification": "Transferase", "pdb": "5N3K", "deposition_date": "2017-02-08", "title": "Camp-dependent protein kinase a from cricetulus griseus in complex With fragment like molecule o-guanidino-l-homoserine", "Keywords": ["Ragment", " complex", " transferase", " serine threonine kinase", " camp", "", "Kinase", "Pka"], "authors": ["C.siefker", "A.heine", "G.klebe"]}
|
||||
{"classification": "Biosynthetic protein", "pdb": "8H52", "deposition_date": "2022-10-11", "title": "Crystal structure of helicobacter pylori carboxyspermidine Dehydrogenase in complex with nadp", "Keywords": ["Arboxyspermidine dehydrogenase", " biosynthetic protein"], "authors": ["K.y.ko", "S.c.park", "S.y.cho", "S.i.yoon"], "pmid": "36283333", "doi": "10.1016/J.BBRC.2022.10.049"}
|
||||
{"classification": "Metal binding protein", "pdb": "6DYC", "deposition_date": "2018-07-01", "title": "Co(ii)-bound structure of the engineered cyt cb562 variant, ch3", "Keywords": ["Esigned protein", " 4-helix bundle", " electron transport", " metal binding", "Protein"], "authors": ["F.a.tezcan", "J.rittle"], "pmid": "30778140", "doi": "10.1038/S41557-019-0218-9"}
|
||||
{"classification": "Protein fibril", "pdb": "6A6B", "deposition_date": "2018-06-27", "title": "Cryo-em structure of alpha-synuclein fiber", "Keywords": ["Lpha-syn fiber", " parkinson disease", " protein fibril"], "authors": ["Y.w.li", "C.y.zhao", "F.luo", "Z.liu", "X.gui", "Z.luo", "X.zhang", "D.li", "C.liu", "X.li"], "pmid": "30065316", "doi": "10.1038/S41422-018-0075-X"}
|
||||
{"classification": "Dna", "pdb": "7D5E", "deposition_date": "2020-09-25", "title": "Left-handed g-quadruplex containing two bulges", "Keywords": ["-quadruplex", " bulge", " dna", " left-handed"], "authors": ["P.das", "A.maity", "K.h.ngo", "F.r.winnerdy", "B.bakalar", "Y.mechulam", "E.schmitt", "", "A.t.phan"], "pmid": "33503265", "doi": "10.1093/NAR/GKAA1259"}
|
||||
{"classification": "Transferase", "pdb": "3RSY", "deposition_date": "2011-05-02", "title": "Cellobiose phosphorylase from cellulomonas uda in complex with sulfate And glycerol", "Keywords": ["H94", " alpha barrel", " cellobiose phosphorylase", " disaccharide", "Phosphorylase", "Transferase"], "authors": ["A.van hoorebeke", "J.stout", "W.soetaert", "J.van beeumen", "T.desmet", "S.savvides"]}
|
||||
{"classification": "Oxidoreductase", "pdb": "7MCI", "deposition_date": "2021-04-02", "title": "Mofe protein from azotobacter vinelandii with a sulfur-replenished Cofactor", "Keywords": ["Zotobacter vinelandii", " mofe-protein", " nitrogenase", " oxidoreductase"], "authors": ["W.kang", "C.lee", "Y.hu", "M.w.ribbe"], "doi": "10.1038/S41929-022-00782-7"}
|
||||
{"classification": "Dna", "pdb": "1XUW", "deposition_date": "2004-10-26", "title": "Structural rationalization of a large difference in rna affinity Despite a small difference in chemistry between two 2'-o-modified Nucleic acid analogs", "Keywords": ["Na mimetic methylcarbamate amide analog", " dna"], "authors": ["R.pattanayek", "L.sethaphong", "C.pan", "M.prhavc", "T.p.prakash", "M.manoharan", "", "M.egli"], "pmid": "15547979", "doi": "10.1021/JA044637K"}
|
||||
{"classification": "Lyase", "pdb": "7C0D", "deposition_date": "2020-05-01", "title": "Crystal structure of azospirillum brasilense l-2-keto-3-deoxyarabonate Dehydratase (hydroxypyruvate-bound form)", "Keywords": ["-2-keto-3-deoxyarabonate dehydratase", " lyase"], "authors": ["Y.watanabe", "S.watanabe"], "pmid": "32697085", "doi": "10.1021/ACS.BIOCHEM.0C00515"}
|
||||
{"classification": "Signaling protein", "pdb": "5LYK", "deposition_date": "2016-09-28", "title": "Crystal structure of intracellular b30.2 domain of btn3a1 bound to Citrate", "Keywords": ["30.2", " butyrophilin", " signaling protein"], "authors": ["F.mohammed", "A.t.baker", "M.salim", "B.e.willcox"], "pmid": "28862425", "doi": "10.1021/ACSCHEMBIO.7B00694"}
|
||||
{"classification": "Toxin", "pdb": "4IZL", "deposition_date": "2013-01-30", "title": "Structure of the n248a mutant of the panton-valentine leucocidin s Component from staphylococcus aureus", "Keywords": ["I-component leucotoxin", " staphylococcus aureus", " s component", "Leucocidin", "Beta-barrel pore forming toxin", "Toxin"], "authors": ["L.maveyraud", "B.j.laventie", "G.prevost", "L.mourey"], "pmid": "24643034", "doi": "10.1371/JOURNAL.PONE.0092094"}
|
||||
{"classification": "Dna", "pdb": "6F3C", "deposition_date": "2017-11-28", "title": "The cytotoxic [pt(h2bapbpy)] platinum complex interacting with the Cgtacg hexamer", "Keywords": ["Rug-dna complex", " four-way junction", " dna"], "authors": ["M.ferraroni", "C.bazzicalupi", "P.gratteri", "F.papi"], "pmid": "31046177", "doi": "10.1002/ANIE.201814532"}
|
||||
{"classification": "Signaling protein/inhibitor", "pdb": "4L5M", "deposition_date": "2013-06-11", "title": "Complexe of arno sec7 domain with the protein-protein interaction Inhibitor n-(4-hydroxy-2,6-dimethylphenyl)benzenesulfonamide at ph6.5", "Keywords": ["Ec-7domain", " signaling protein-inhibitor complex"], "authors": ["F.hoh", "J.rouhana"], "pmid": "24112024", "doi": "10.1021/JM4009357"}
|
||||
{"classification": "Signaling protein", "pdb": "5I6J", "deposition_date": "2016-02-16", "title": "Crystal structure of srgap2 f-barx", "Keywords": ["Rgap2", " f-bar", " fx", " signaling protein"], "authors": ["M.sporny", "J.guez-haddad", "M.n.isupov", "Y.opatowsky"], "pmid": "28333212", "doi": "10.1093/MOLBEV/MSX094"}
|
||||
{"classification": "Metal binding protein", "pdb": "1Q80", "deposition_date": "2003-08-20", "title": "Solution structure and dynamics of nereis sarcoplasmic calcium binding Protein", "Keywords": ["Ll-alpha", " metal binding protein"], "authors": ["G.rabah", "R.popescu", "J.a.cox", "Y.engelborghs", "C.t.craescu"], "pmid": "15819893", "doi": "10.1111/J.1742-4658.2005.04629.X"}
|
||||
{"classification": "Transferase", "pdb": "1TW1", "deposition_date": "2004-06-30", "title": "Beta-1,4-galactosyltransferase mutant met344his (m344h-gal-t1) complex With udp-galactose and magnesium", "Keywords": ["Et344his mutation; closed conformation; mn binding", " transferase"], "authors": ["B.ramakrishnan", "E.boeggeman", "P.k.qasba"], "pmid": "15449940", "doi": "10.1021/BI049007+"}
|
||||
{"classification": "Rna", "pdb": "2PN4", "deposition_date": "2007-04-23", "title": "Crystal structure of hepatitis c virus ires subdomain iia", "Keywords": ["Cv", " ires", " subdoamin iia", " rna", " strontium", " hepatitis"], "authors": ["Q.zhao", "Q.han", "C.r.kissinger", "P.a.thompson"], "pmid": "18391410", "doi": "10.1107/S0907444908002011"}
|
||||
{"pdb": "1CW0", "title": "crystal structure analysis of very short patch repair (vsr) endonuclease in complex with a duplex dna", "authors": ["S.E.Tsutakawa", "H.Jingami", "K.Morikawa"], "doi": "10.1016/S0092-8674(00)81550-0", "pmid": "10612397"}
|
||||
{"pdb": "2CWW", "title": "crystal structure of thermus thermophilus ttha1280, a putative sam- dependent rna methyltransferase, in complex with s-adenosyl-l- homocysteine", "authors": ["A.A.Pioszak", "K.Murayama", "N.Nakagawa", "A.Ebihara", "S.Kuramitsu", "M.Shirouzu", "S.Yokoyama", "Riken Structural Genomics/proteomics Initiative (Rsgi)"], "doi": "10.1107/S1744309105029842", "pmid": "16511182"}
|
||||
{"pdb": "6CWE", "title": "structure of alpha-gsa[8,6p] bound by cd1d and in complex with the va14vb8.2 tcr", "authors": ["J.Wang", "D.Zajonc"], "doi": null, "pmid": null}
|
||||
{"pdb": "5CWS", "title": "crystal structure of the intact chaetomium thermophilum nsp1-nup49- nup57 channel nucleoporin heterotrimer bound to its nic96 nuclear pore complex attachment site", "authors": ["C.J.Bley", "S.Petrovic", "M.Paduch", "V.Lu", "A.A.Kossiakoff", "A.Hoelz"], "doi": "10.1126/SCIENCE.AAC9176", "pmid": "26316600"}
|
||||
{"pdb": "5CWE", "title": "structure of cyp107l2 from streptomyces avermitilis with lauric acid", "authors": ["T.-V.Pham", "S.-H.Han", "J.-H.Kim", "D.-H.Kim", "L.-W.Kang"], "doi": null, "pmid": null}
|
||||
{"pdb": "7CW4", "title": "acetyl-coa acetyltransferase from bacillus cereus atcc 14579", "authors": ["J.Hong", "K.J.Kim"], "doi": "10.1016/J.BBRC.2020.09.048", "pmid": "32972748"}
|
||||
{"pdb": "2CWP", "title": "crystal structure of metrs related protein from pyrococcus horikoshii", "authors": ["K.Murayama", "M.Kato-Murayama", "M.Shirouzu", "S.Yokoyama", "Riken StructuralGenomics/proteomics Initiative (Rsgi)"], "doi": null, "pmid": null}
|
||||
{"pdb": "2CW7", "title": "crystal structure of intein homing endonuclease ii", "authors": ["H.Matsumura", "H.Takahashi", "T.Inoue", "H.Hashimoto", "M.Nishioka", "S.Fujiwara", "M.Takagi", "T.Imanaka", "Y.Kai"], "doi": "10.1002/PROT.20858", "pmid": "16493661"}
|
||||
{"pdb": "1CWU", "title": "brassica napus enoyl acp reductase a138g mutant complexed with nad+ and thienodiazaborine", "authors": ["A.Roujeinikova", "J.B.Rafferty", "D.W.Rice"], "doi": "10.1074/JBC.274.43.30811", "pmid": "10521472"}
|
||||
{"pdb": "3CWN", "title": "escherichia coli transaldolase b mutant f178y", "authors": ["T.Sandalova", "G.Schneider", "A.Samland"], "doi": "10.1074/JBC.M803184200", "pmid": "18687684"}
|
||||
{"pdb": "1CWL", "title": "human cyclophilin a complexed with 4 4-hydroxy-meleu cyclosporin", "authors": ["V.Mikol", "J.Kallen", "P.Taylor", "M.D.Walkinshaw"], "doi": "10.1006/JMBI.1998.2108", "pmid": "9769216"}
|
||||
{"pdb": "3CW2", "title": "crystal structure of the intact archaeal translation initiation factor 2 from sulfolobus solfataricus .", "authors": ["E.A.Stolboushkina", "S.V.Nikonov", "A.D.Nikulin", "U.Blaesi", "D.J.Manstein", "R.V.Fedorov", "M.B.Garber", "O.S.Nikonov"], "doi": "10.1016/J.JMB.2008.07.039", "pmid": "18675278"}
|
||||
{"pdb": "3CW9", "title": "4-chlorobenzoyl-coa ligase/synthetase in the thioester-forming conformation, bound to 4-chlorophenacyl-coa", "authors": ["A.S.Reger", "J.Cao", "R.Wu", "D.Dunaway-Mariano", "A.M.Gulick"], "doi": "10.1021/BI800696Y", "pmid": "18620418"}
|
||||
{"pdb": "3CWU", "title": "crystal structure of an alka host/guest complex 2'-fluoro-2'-deoxy-1, n6-ethenoadenine:thymine base pair", "authors": ["B.R.Bowman", "S.Lee", "S.Wang", "G.L.Verdine"], "doi": "10.1016/J.STR.2008.04.012", "pmid": "18682218"}
|
||||
{"pdb": "5CWF", "title": "crystal structure of de novo designed helical repeat protein dhr8", "authors": ["G.Bhabha", "D.C.Ekiert"], "doi": "10.1038/NATURE16162", "pmid": "26675729"}
|
|
@ -1,36 +1,6 @@
|
|||
{"pid": " Q6GZX4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 43"}], "title": "Putative transcription factor 001R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZX3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 45"}], "title": "Uncharacterized protein 002L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197F8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 29"}], "title": "Uncharacterized protein 002R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q197F7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 23"}], "title": "Uncharacterized protein 003L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZX2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 3R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZX1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 38"}], "title": "Uncharacterized protein 004R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197F5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 005L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZX0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 47"}], "title": "Uncharacterized protein 005R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q91G88", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-06-28", "date_info": "entry version 53"}], "title": "Putative KilA-N domain-containing protein 006L", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
|
||||
{"pid": " Q6GZW9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 34"}], "title": "Uncharacterized protein 006R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZW8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 32"}], "title": "Uncharacterized protein 007R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197F3", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 007R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q197F2", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 22"}], "title": "Uncharacterized protein 008L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZW6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 67"}], "title": "Putative helicase 009L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q91G85", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 38"}], "title": "Uncharacterized protein 009R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
|
||||
{"pid": " Q6GZW5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 010R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197E9", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 011L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZW4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 011R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZW3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 012L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197E7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 37"}], "title": "Uncharacterized protein IIV3-013L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZW2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 30"}], "title": "Uncharacterized protein 013R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZW1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 014R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZW0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 50"}], "title": "Uncharacterized protein 015R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZV8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 017L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZV7", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 018L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZV6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 87"}], "title": "Putative serine/threonine-protein kinase 019R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZV5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 40"}], "title": "Uncharacterized protein 020R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZV4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 021L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197D8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-12-14", "date_info": "entry version 35"}], "title": "Transmembrane protein 022L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZV2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 023R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197D7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 25"}], "title": "Uncharacterized protein 023R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q6GZV1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 024R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q197D5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 24"}], "title": "Uncharacterized protein 025R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": " Q91G70", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 026R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
|
||||
{"pid": " Q6GZU9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 49"}], "title": "Uncharacterized protein 027R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": " Q6GZU8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 55"}], "title": "Uncharacterized protein 028R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": "Q6GZX4", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 41."}], "title": "Putative transcription factor 001R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": "Q6GZX3", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 42."}], "title": "Uncharacterized protein 002L;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": "Q197F8", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 27."}], "title": "Uncharacterized protein 002R;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": "Q197F7", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 23."}], "title": "Uncharacterized protein 003L;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
|
||||
{"pid": "Q6GZX2", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 36."}], "title": "Uncharacterized protein 3R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
||||
{"pid": "Q6GZX1", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 34."}], "title": "Uncharacterized protein 004R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
|
|
@ -26,7 +26,7 @@ class MAGMappingTest {
|
|||
@Test
|
||||
def mappingMagType(): Unit = {
|
||||
|
||||
checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = true, "Other literature type")
|
||||
checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = false, "Other literature type")
|
||||
checkResult[Publication](
|
||||
MagUtility.createResultFromType(Some("BookChapter"), null),
|
||||
invisible = false,
|
||||
|
|
|
@ -17,6 +17,45 @@ import eu.dnetlib.pace.tree.support.TreeStats;
|
|||
|
||||
class DecisionTreeTest {
|
||||
|
||||
@Test
|
||||
void testJPath() throws IOException {
|
||||
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
|
||||
System.out.println("row = " + row.getAs("countrytitle"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void jsonToModelTest() throws IOException {
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOpenorgsDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
// to check that the same parsing returns the same row
|
||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
Assertions.assertEquals(row, row1);
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
}
|
||||
|
||||
@Test
|
||||
void organizationDecisionTreeTest() throws Exception {
|
||||
DedupConfig conf = DedupConfig
|
||||
|
|
|
@ -452,18 +452,18 @@ public class SparkDedupTest implements Serializable {
|
|||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||
assertFalse(dups.contains(r.getTarget()));
|
||||
assertTrue(dups.contains(r.getTarget()));
|
||||
});
|
||||
|
||||
final List<Relation> mergedIn = pubs
|
||||
.filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||
.collectAsList();
|
||||
assertEquals(1, mergedIn.size());
|
||||
assertEquals(3, mergedIn.size());
|
||||
mergedIn.forEach(r -> {
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
||||
assertFalse(dups.contains(r.getSource()));
|
||||
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||
assertTrue(dups.contains(r.getSource()));
|
||||
});
|
||||
|
||||
System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||
|
@ -473,8 +473,8 @@ public class SparkDedupTest implements Serializable {
|
|||
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||
|
||||
if (CHECK_CARDINALITIES) {
|
||||
assertEquals(1278, orgs_mergerel);
|
||||
assertEquals(1158, pubs.count());
|
||||
assertEquals(1268, orgs_mergerel);
|
||||
assertEquals(1156, pubs.count());
|
||||
assertEquals(292, sw_mergerel);
|
||||
assertEquals(476, ds_mergerel);
|
||||
assertEquals(742, orp_mergerel);
|
||||
|
|
|
@ -241,6 +241,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
|||
|
||||
verifyRoot_case_1(roots, pubs);
|
||||
verifyRoot_case_2(roots, pubs);
|
||||
verifyRoot_case_3(roots, pubs);
|
||||
}
|
||||
|
||||
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||
|
@ -321,6 +322,34 @@ public class SparkPublicationRootsTest implements Serializable {
|
|||
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
||||
}
|
||||
|
||||
private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||
Publication root = roots
|
||||
.filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
|
||||
.first();
|
||||
assertNotNull(root);
|
||||
|
||||
Publication pivot_duplicate = pubs
|
||||
.filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
|
||||
.first();
|
||||
|
||||
assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
||||
Set<String> dups_cf = pubs
|
||||
.collectAsList()
|
||||
.stream()
|
||||
.flatMap(p -> p.getCollectedfrom().stream())
|
||||
.map(KeyValue::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
Set<String> root_cf = root
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(KeyValue::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(6)
|
||||
void updateEntityTest() throws Exception {
|
||||
|
|
|
@ -143,9 +143,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
|||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath,
|
||||
"--hiveMetastoreUris", "none",
|
||||
"--pivotHistoryDatabase", ""
|
||||
"--workingPath", workingPath
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
|
@ -155,7 +153,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
|||
.as(Encoders.bean(Relation.class));
|
||||
|
||||
assertEquals(
|
||||
4, merges
|
||||
3, merges
|
||||
.filter("relclass == 'isMergedIn'")
|
||||
.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
|
||||
.distinct()
|
||||
|
@ -180,7 +178,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
|||
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||
|
||||
assertEquals(4, roots.count());
|
||||
assertEquals(3, roots.count());
|
||||
|
||||
final Dataset<Publication> pubs = spark
|
||||
.read()
|
||||
|
@ -197,7 +195,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
|
|||
.collectAsList()
|
||||
.get(0);
|
||||
|
||||
assertEquals("2022-01-01", root.getDateofacceptance().getValue());
|
||||
assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
|
||||
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
||||
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
|
||||
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
|
|
@ -168,7 +168,7 @@ public class SparkStatsTest implements Serializable {
|
|||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
||||
.count();
|
||||
|
||||
assertEquals(412, orgs_blocks);
|
||||
assertEquals(414, orgs_blocks);
|
||||
assertEquals(221, pubs_blocks);
|
||||
assertEquals(134, sw_blocks);
|
||||
assertEquals(196, ds_blocks);
|
||||
|
|
|
@ -73,6 +73,12 @@
|
|||
"name": "Irish Nephrology Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100011062",
|
||||
"uri": "http://dx.doi.org/10.13039/100011062",
|
||||
"name": "Asian Spinal Cord Network",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100011096",
|
||||
"uri": "http://dx.doi.org/10.13039/100011096",
|
||||
|
@ -217,6 +223,12 @@
|
|||
"name": "Global Brain Health Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015776",
|
||||
"uri": "http://dx.doi.org/10.13039/100015776",
|
||||
"name": "Health and Social Care Board",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015992",
|
||||
"uri": "http://dx.doi.org/10.13039/100015992",
|
||||
|
@ -391,6 +403,18 @@
|
|||
"name": "Irish Hospice Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001596",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001596",
|
||||
"name": "Irish Research Council for Science, Engineering and Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001597",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001597",
|
||||
"name": "Irish Research Council for the Humanities and Social Sciences",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001598",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001598",
|
||||
|
@ -491,7 +515,7 @@
|
|||
"id": "501100002081",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002081",
|
||||
"name": "Irish Research Council",
|
||||
"synonym": ["501100001596", "501100001597"]
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100002736",
|
||||
|
|
|
@ -560,15 +560,7 @@ case object Crossref2Oaf {
|
|||
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
|
||||
"10.13039/501100013589" | "10.13039/501100000271" =>
|
||||
generateSimpleRelationFromAward(funder, "ukri________", a => a)
|
||||
//HFRI
|
||||
case "10.13039/501100013209" =>
|
||||
generateSimpleRelationFromAward(funder, "hfri________", a => a)
|
||||
val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
//ERASMUS+
|
||||
case "10.13039/501100010790" =>
|
||||
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
|
||||
|
||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||
|
||||
}
|
||||
|
|
|
@ -313,7 +313,7 @@ case object ConversionUtil {
|
|||
if (f.author.DisplayName.isDefined)
|
||||
a.setFullname(f.author.DisplayName.get)
|
||||
if (f.affiliation != null)
|
||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||
a.setRawAffiliationString(List(f.affiliation).asJava)
|
||||
a.setPid(
|
||||
List(
|
||||
createSP(
|
||||
|
@ -386,7 +386,7 @@ case object ConversionUtil {
|
|||
a.setFullname(f.author.DisplayName.get)
|
||||
|
||||
if (f.affiliation != null)
|
||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||
a.setRawAffiliationString(List(f.affiliation).asJava)
|
||||
|
||||
a.setPid(
|
||||
List(
|
||||
|
|
|
@ -13,13 +13,13 @@ public class CommunityContentprovider {
|
|||
private String openaireId;
|
||||
private SelectionConstraints selectioncriteria;
|
||||
|
||||
private Boolean enabled;
|
||||
private String enabled;
|
||||
|
||||
public Boolean getEnabled() {
|
||||
public String getEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
public void setEnabled(Boolean enabled) {
|
||||
public void setEnabled(String enabled) {
|
||||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
|
|
|
@ -53,8 +53,6 @@ public class Constraints implements Serializable {
|
|||
|
||||
for (Constraint sc : constraint) {
|
||||
boolean verified = false;
|
||||
if (!param.containsKey(sc.getField()))
|
||||
return false;
|
||||
for (String value : param.get(sc.getField())) {
|
||||
if (sc.verifyCriteria(value.trim())) {
|
||||
verified = true;
|
||||
|
|
|
@ -14,7 +14,6 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -85,12 +84,11 @@ public class SparkCountryPropagationJob {
|
|||
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
|
||||
|
||||
log.info("Reading prepared info: {}", preparedInfoPath);
|
||||
final Dataset<Row> preparedInfoRaw = spark
|
||||
Dataset<ResultCountrySet> prepared = spark
|
||||
.read()
|
||||
.json(preparedInfoPath);
|
||||
.json(preparedInfoPath)
|
||||
.as(Encoders.bean(ResultCountrySet.class));
|
||||
|
||||
if (!preparedInfoRaw.isEmpty()) {
|
||||
final Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
|
||||
res
|
||||
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
|
||||
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
|
||||
|
@ -98,13 +96,7 @@ public class SparkCountryPropagationJob {
|
|||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
} else {
|
||||
res
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -147,7 +147,6 @@ public class CleanGraphSparkJob {
|
|||
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) GraphCleaningFunctions::dedicatedUglyHacks, Encoders.bean(clazz))
|
||||
.filter((FilterFunction<T>) GraphCleaningFunctions::filter);
|
||||
|
||||
// read the master-duplicate tuples
|
||||
|
|
|
@ -9,10 +9,7 @@ import java.util.Optional;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -25,8 +22,6 @@ public class GraphHiveTableImporterJob {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -74,7 +69,12 @@ public class GraphHiveTableImporterJob {
|
|||
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
|
||||
Class<T> clazz, int numPartitions) {
|
||||
|
||||
Dataset<String> dataset = spark.read().textFile(inputPath);
|
||||
final Encoder<T> clazzEncoder = Encoders.bean(clazz);
|
||||
|
||||
Dataset<Row> dataset = spark
|
||||
.read()
|
||||
.schema(clazzEncoder.schema())
|
||||
.json(inputPath);
|
||||
|
||||
if (numPartitions > 0) {
|
||||
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
|
||||
|
@ -82,7 +82,6 @@ public class GraphHiveTableImporterJob {
|
|||
}
|
||||
|
||||
dataset
|
||||
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.saveAsTable(tableIdentifier(hiveDbName, clazz));
|
||||
|
|
|
@ -94,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
|
||||
}
|
||||
|
||||
author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info));
|
||||
author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
|
||||
author.setPid(preparePids(n, info));
|
||||
author.setRank(pos++);
|
||||
res.add(author);
|
||||
|
|
|
@ -223,13 +223,11 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
|
||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
|
@ -255,13 +253,11 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
|
@ -282,7 +278,6 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -73,14 +73,10 @@ public class GraphHiveImporterJobTest {
|
|||
GraphHiveImporterJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
|
||||
"-hiveMetastoreUris",
|
||||
"",
|
||||
"-hiveDbName",
|
||||
dbName
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
|
||||
"--hiveMetastoreUris", "",
|
||||
"--hiveDbName", dbName
|
||||
});
|
||||
|
||||
ModelSupport.oafTypes
|
||||
|
|
|
@ -406,15 +406,15 @@ class MappersTest {
|
|||
assertEquals("Baracchini", author.get().getSurname());
|
||||
assertEquals("Theo", author.get().getName());
|
||||
|
||||
assertEquals(1, author.get().getAffiliation().size());
|
||||
final Optional<Field<String>> opAff = author
|
||||
assertEquals(1, author.get().getRawAffiliationString().size());
|
||||
final Optional<String> opAff = author
|
||||
.get()
|
||||
.getAffiliation()
|
||||
.getRawAffiliationString()
|
||||
.stream()
|
||||
.findFirst();
|
||||
assertTrue(opAff.isPresent());
|
||||
final Field<String> affiliation = opAff.get();
|
||||
assertEquals("ISTI-CNR", affiliation.getValue());
|
||||
final String affiliation = opAff.get();
|
||||
assertEquals("ISTI-CNR", affiliation);
|
||||
|
||||
assertFalse(d.getSubject().isEmpty());
|
||||
assertFalse(d.getInstance().isEmpty());
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -31,5 +31,11 @@ class ORCIDAuthorMatchersTest {
|
|||
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
|
||||
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
|
||||
}
|
||||
|
||||
@Test def testDocumentationNames(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
|
||||
}
|
||||
|
||||
@Test def testDocumentationNames2(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller Jones"))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -69,7 +69,7 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="irish_oaiphm_provision"/>
|
||||
<start to="oaiphm_provision"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
|
|
@ -67,7 +67,7 @@ public class PrepareRelationsJobTest {
|
|||
@Test
|
||||
void testRunPrepareRelationsJob(@TempDir Path testPath) throws Exception {
|
||||
|
||||
final int maxRelations = 5;
|
||||
final int maxRelations = 20;
|
||||
PrepareRelationsJob
|
||||
.main(
|
||||
new String[] {
|
||||
|
@ -86,7 +86,7 @@ public class PrepareRelationsJobTest {
|
|||
.as(Encoders.bean(Relation.class))
|
||||
.cache();
|
||||
|
||||
assertEquals(44, out.count());
|
||||
assertEquals(maxRelations, out.count());
|
||||
|
||||
Dataset<Row> freq = out
|
||||
.toDF()
|
||||
|
@ -101,8 +101,12 @@ public class PrepareRelationsJobTest {
|
|||
long affiliation = getRows(freq, AFFILIATION).get(0).getAs("count");
|
||||
|
||||
assertEquals(outcome, participation);
|
||||
assertEquals(outcome, affiliation);
|
||||
assertEquals(4, affiliation);
|
||||
assertTrue(outcome > affiliation);
|
||||
assertTrue(participation > affiliation);
|
||||
|
||||
assertEquals(7, outcome);
|
||||
assertEquals(7, participation);
|
||||
assertEquals(6, affiliation);
|
||||
}
|
||||
|
||||
protected List<Row> getRows(Dataset<Row> freq, String col) {
|
||||
|
|
|
@ -91,6 +91,9 @@ class SolrRecordDumpJobTest {
|
|||
public void prepareMocks() throws ISLookUpException, IOException {
|
||||
isLookupClient.setIsLookup(isLookUpService);
|
||||
|
||||
Mockito
|
||||
.when(isLookupClient.getDsId(Mockito.anyString()))
|
||||
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
|
||||
Mockito
|
||||
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
|
||||
.thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));
|
||||
|
|
|
@ -48,25 +48,16 @@
|
|||
<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
|
||||
<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
|
||||
<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
|
||||
<case to="clear-working-dir">${wf:conf('resume') eq "start"}</case>
|
||||
<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
|
||||
|
||||
<!-- Aggregation of impact scores on the project level -->
|
||||
<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
|
||||
<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>
|
||||
|
||||
<default to="clear-working-dir" />
|
||||
<default to="create-openaire-ranking-graph" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="clear-working-dir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="create-openaire-ranking-graph"/>
|
||||
<error to="clear-working-dir-fail"/>
|
||||
</action>
|
||||
|
||||
<!-- initial step: create citation network -->
|
||||
<action name="create-openaire-ranking-graph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
@ -627,10 +618,6 @@
|
|||
<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<kill name="clear-working-dir-fail">
|
||||
<message>Re-create working dir failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<!-- Define ending node -->
|
||||
<end name="end" />
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ select distinct * from (
|
|||
from SOURCE.result r
|
||||
join SOURCE.result_projects rp on rp.id=r.id
|
||||
join SOURCE.project p on p.id=rp.project
|
||||
join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
|
||||
join TARGET.irish_funders irf on irf.funder=p.funder
|
||||
union all
|
||||
select r.*
|
||||
from SOURCE.result r
|
||||
|
|
|
@ -1,79 +1,3 @@
|
|||
--drop database if exists TARGET cascade;
|
||||
--create database if not exists TARGET;
|
||||
--
|
||||
--create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
--create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
--create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
--create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
--create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--
|
||||
--create table TARGET.result stored as parquet as
|
||||
-- select distinct * from (
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
|
||||
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
|
||||
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
|
||||
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
|
||||
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
|
||||
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
|
||||
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
|
||||
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
|
||||
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
|
||||
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
|
||||
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
|
||||
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
|
||||
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
|
||||
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
|
||||
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
|
||||
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
|
||||
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
|
||||
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
|
||||
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
|
||||
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
|
||||
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
|
||||
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
|
||||
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
|
||||
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
|
||||
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
|
||||
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
|
||||
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
|
||||
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
|
||||
-- ) )) foo;
|
||||
--
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
|
|
|
@ -81,7 +81,17 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
))) foo;
|
||||
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
|
|
@ -61,7 +61,17 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
))) foo;
|
||||
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -0,0 +1,18 @@
|
|||
# Install the whole "dnet-hadoop" project.
|
||||
|
||||
# Delete this module's previous build-files in order to avoid any conflicts.
|
||||
rm -rf target/ ||
|
||||
|
||||
# Go to the root directory of this project.
|
||||
cd ../../
|
||||
|
||||
# Select the build profile.
|
||||
DEFAULT_PROFILE='' # It's the empty profile.
|
||||
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
|
||||
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
|
||||
|
||||
# Install the project.
|
||||
mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
|
||||
|
||||
# We skip tests for all modules, since they take a large amount of time and some of them fail.
|
||||
# Any test added to this module will be executed by the "runOozieWorkflow.sh" script.
|
|
@ -0,0 +1,20 @@
|
|||
# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file.
|
||||
|
||||
# Select the build profile.
|
||||
DEFAULT_PROFILE='' # It's the empty profile.
|
||||
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
|
||||
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
|
||||
|
||||
# Build and deploy this module.
|
||||
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
|
||||
-Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
|
||||
|
||||
# Show the Oozie-job-ID.
|
||||
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
|
||||
cat ./target/extract-and-run-on-remote-host.log
|
||||
|
||||
# Check oozie workflow status
|
||||
# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
|
||||
|
||||
# Get the <job-ID> from the previous output and check the logs:
|
||||
# yarn logs -applicationId application_<job-ID>
|
|
@ -1,8 +1,10 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
-- Stats database creation
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
|
||||
DROP database IF EXISTS ${stats_db_name} CASCADE;
|
||||
CREATE database ${stats_db_name};
|
||||
DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
|
||||
CREATE database ${stats_db_name}; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
|
||||
|
@ -5,27 +7,27 @@
|
|||
------------------------------------------------------------------------------------------------
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.fundref;
|
||||
FROM ${external_stats_db_name}.fundref; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.country AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.country;
|
||||
FROM ${external_stats_db_name}.country; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.countrygdp;
|
||||
FROM ${external_stats_db_name}.countrygdp; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.roarmap;
|
||||
FROM ${external_stats_db_name}.roarmap; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.rndexpediture;
|
||||
FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.licenses_normalized;
|
||||
FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
|
@ -33,23 +35,23 @@ FROM ${external_stats_db_name}.licenses_normalized;
|
|||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
create or replace view ${stats_db_name}.usage_stats as
|
||||
select * from openaire_prod_usage_stats.usage_stats;
|
||||
select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.downloads_stats as
|
||||
select * from openaire_prod_usage_stats.downloads_stats;
|
||||
select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.pageviews_stats as
|
||||
select * from openaire_prod_usage_stats.pageviews_stats;
|
||||
select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.views_stats as
|
||||
select * from openaire_prod_usage_stats.views_stats;
|
||||
select * from openaire_prod_usage_stats.views_stats; /*EOS*/
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
-- Creation date of the database
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.creation_date STORED AS PARQUET as
|
||||
select date_format(current_date(), 'dd-MM-yyyy') as date;
|
||||
select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/
|
||||
|
|
|
@ -1,110 +1,11 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
-- Post processing - Updates on main tables
|
||||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
|
||||
-- Datasource temporary table updates
|
||||
UPDATE ${stats_db_name}.datasource_tmp
|
||||
SET harvested='true'
|
||||
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
|
||||
FROM ${stats_db_name}.datasource_tmp d,
|
||||
${stats_db_name}.result_datasources rd
|
||||
WHERE d.id = rd.datasource);
|
||||
|
||||
-- Project temporary table update and final project table creation with final updates that cannot be applied to ORC tables
|
||||
UPDATE ${stats_db_name}.project_tmp
|
||||
SET haspubs='yes'
|
||||
WHERE project_tmp.id IN (SELECT pr.id
|
||||
FROM ${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.result r
|
||||
WHERE pr.result = r.id
|
||||
AND r.type = 'publication');
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.stored purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project stored as parquet as
|
||||
SELECT p.id,
|
||||
p.acronym,
|
||||
p.title,
|
||||
p.funder,
|
||||
p.funding_lvl0,
|
||||
p.funding_lvl1,
|
||||
p.funding_lvl2,
|
||||
p.ec39,
|
||||
p.type,
|
||||
p.startdate,
|
||||
p.enddate,
|
||||
p.start_year,
|
||||
p.end_year,
|
||||
p.duration,
|
||||
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
|
||||
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
|
||||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
|
||||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
|
||||
p.callidentifier,
|
||||
p.code,
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
p.currency
|
||||
FROM ${stats_db_name}.project_tmp p
|
||||
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
|
||||
FROM ${stats_db_name}.project_results pr
|
||||
INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
|
||||
WHERE r.type = 'publication'
|
||||
GROUP BY pr.id) AS prr1 on prr1.id = p.id
|
||||
LEFT JOIN (SELECT pp.id,
|
||||
max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
|
||||
count(distinct r.id) AS dp
|
||||
FROM ${stats_db_name}.project_tmp pp,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.result r
|
||||
WHERE pp.id = pr.id
|
||||
AND pr.result = r.id
|
||||
AND r.type = 'publication'
|
||||
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
|
||||
GROUP BY pp.id) AS prr2
|
||||
ON prr2.id = p.id;
|
||||
|
||||
UPDATE ${stats_db_name}.publication_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE publication_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
UPDATE ${stats_db_name}.dataset_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE dataset_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
UPDATE ${stats_db_name}.software_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE software_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
UPDATE ${stats_db_name}.otherresearchproduct_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
|
||||
SELECT result_projects.id AS result,
|
||||
result_projects.project AS project_results,
|
||||
|
@ -116,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
|
|||
${stats_db_name}.project
|
||||
WHERE result_projects.id = result.id
|
||||
AND result.type = 'publication'
|
||||
AND project.id = result_projects.project;
|
||||
AND project.id = result_projects.project; /*EOS*/
|
|
@ -1,42 +1,4 @@
|
|||
------------------------------------------------------------------------------------------------------
|
||||
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
|
||||
------------------------------------------------------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.datasource_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.publication_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.dataset_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.software_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_tmp;
|
||||
|
||||
DROP TABLE ${stats_db_name}.project_tmp;
|
||||
DROP TABLE ${stats_db_name}.datasource_tmp;
|
||||
DROP TABLE ${stats_db_name}.publication_tmp;
|
||||
DROP TABLE ${stats_db_name}.dataset_tmp;
|
||||
DROP TABLE ${stats_db_name}.software_tmp;
|
||||
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------
|
||||
-- Re-creating views from final parquet tables
|
||||
|
@ -54,4 +16,4 @@ SELECT *, bestlicence AS access_mode
|
|||
FROM ${stats_db_name}.dataset
|
||||
UNION ALL
|
||||
SELECT *, bestlicence AS access_mode
|
||||
FROM ${stats_db_name}.otherresearchproduct;
|
||||
FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Additional relations
|
||||
|
@ -5,10 +7,10 @@
|
|||
-- Sources related tables/views
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -16,12 +18,12 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -29,12 +31,12 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -42,12 +44,12 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -55,7 +57,7 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
|
||||
SELECT * FROM ${stats_db_name}.publication_sources
|
||||
|
@ -64,24 +66,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources
|
|||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.software_sources
|
||||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
|
||||
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
|
||||
select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
|
||||
from (
|
||||
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
|
||||
FROM ${openaire_db_name}.result res
|
||||
LATERAL VIEW explode(author) a as auth
|
||||
LATERAL VIEW explode(auth.pid) ap as auth_pid
|
||||
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
|
||||
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
|
||||
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
|
||||
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
|
||||
select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
|
@ -91,12 +93,12 @@ where reltype='resultResult'
|
|||
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
|
||||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
|
||||
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||
select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
|
@ -108,12 +110,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
|
|||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(target, 4);
|
||||
group by substr(target, 4); /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
|
||||
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||
select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
|
@ -125,4 +127,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
|
|||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(source, 4);
|
||||
group by substr(source, 4); /*EOS*/
|
|
@ -1,4 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Additional relations
|
||||
|
@ -6,33 +7,33 @@ set mapred.job.queue.name=analytics;
|
|||
-- Licences related tables/views
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
|
||||
SELECT * FROM ${stats_db_name}.publication_licenses
|
||||
|
@ -41,29 +42,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
|
|||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.software_licenses
|
||||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
|
||||
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
|
||||
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
|
||||
select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
|
||||
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
|
||||
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
|
||||
from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
|
||||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
|
||||
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
|
||||
select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
|
||||
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
|
||||
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
|
||||
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
|
@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics;
|
|||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -18,15 +18,15 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -36,15 +36,15 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -54,15 +54,15 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -72,13 +72,13 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
|
||||
select * from ${stats_db_name}.publication_refereed
|
||||
|
@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed
|
|||
union all
|
||||
select * from ${stats_db_name}.software_refereed
|
||||
union all
|
||||
select * from ${stats_db_name}.otherresearchproduct_refereed;
|
||||
select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
|
||||
select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
|
||||
select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
|
||||
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
|
||||
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
|
||||
where measures_ids.id!='views' and measures_ids.id!='downloads';
|
||||
where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
|
||||
select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
|
||||
select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
|
||||
cast(rel.properties[0].value as double) apc_amount,
|
||||
rel.properties[1].value apc_currency
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.organization o on o.id=rel.source
|
||||
join ${openaire_db_name}.result r on r.id=rel.target
|
||||
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
|
||||
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/
|
||||
|
|
|
@ -1,27 +1,27 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
-------------------------------------------
|
||||
--- Extra tables, mostly used by indicators
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
|
||||
select r.id, count(distinct p.id) as count
|
||||
select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
|
||||
left outer join ${stats_db_name}.project p on p.id=rp.project
|
||||
group by r.id;
|
||||
group by r.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
|
||||
select r.id, count(distinct p.funder) as count
|
||||
select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
|
||||
left outer join ${stats_db_name}.project p on p.id=rp.project
|
||||
group by r.id;
|
||||
group by r.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
|
||||
with rcount as (
|
||||
|
@ -30,39 +30,39 @@ with rcount as (
|
|||
left outer join ${stats_db_name}.result_projects rp on rp.project=p.id
|
||||
left outer join ${stats_db_name}.result r on r.id=rp.id
|
||||
group by r.type, p.id )
|
||||
select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
|
||||
select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
|
||||
sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets,
|
||||
sum(case when rcount.type='software' then rcount.count else 0 end) as software,
|
||||
sum(case when rcount.type='other' then rcount.count else 0 end) as other
|
||||
from rcount
|
||||
group by rcount.pid;
|
||||
group by rcount.pid; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
|
||||
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
|
||||
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
|
||||
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
|
||||
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
|
||||
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
|
||||
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates;
|
||||
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
|
||||
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
|
||||
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
|
||||
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
|
||||
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
|
||||
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
|
||||
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_instance stored as parquet as
|
||||
select distinct r.*
|
||||
select /*+ COALESCE(100) */ distinct r.*
|
||||
from (
|
||||
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
||||
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
|
||||
join ${stats_db_name}.result res on res.id=r.id;
|
||||
join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
|
||||
select distinct r.id, r.amount, r.currency
|
||||
select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency
|
||||
from (
|
||||
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
|
||||
join ${stats_db_name}.result res on res.id=r.id
|
||||
where r.amount is not null;
|
||||
where r.amount is not null; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
|
||||
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/
|
|
@ -1,7 +1,7 @@
|
|||
-- Sprint 1 ----
|
||||
drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
|
||||
select distinct p.id, coalesce(green_oa, 0) as green_oa
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as green_oa
|
||||
|
@ -12,7 +12,7 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
|
||||
select distinct p.id, coalesce(grey_lit, 0) as grey_lit
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as grey_lit
|
||||
|
@ -23,7 +23,7 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
|
||||
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
|
||||
|
@ -33,7 +33,7 @@ left outer join (
|
|||
-- Sprint 2 ----
|
||||
drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
|
||||
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
|
||||
select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select r.id, license.type as lic from ${stats_db_name}.result r
|
||||
|
@ -42,7 +42,7 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
|
||||
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
|
||||
select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select r.id, lower(parse_url(license.type, "HOST")) as lic_host
|
||||
|
@ -52,12 +52,12 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
|
||||
select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
|
||||
select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
|
||||
from ${stats_db_name}.publication; /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
|
||||
select distinct r.id, coalesce(has_orcid, 0) as has_orcid
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
|
||||
|
@ -66,7 +66,7 @@ left outer join (
|
|||
---- Sprint 3 ----
|
||||
drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
|
||||
select distinct r.result as id, coalesce(fundref, 0) as fundref
|
||||
select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref
|
||||
from ${stats_db_name}.project_results r
|
||||
left outer join (
|
||||
select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
|
||||
|
@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par
|
|||
SELECT ro.organization organization, ro.id, o.name
|
||||
from ${stats_db_name}.result_organization ro
|
||||
join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
|
||||
select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
|
||||
group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
|
||||
|
@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store
|
|||
from ${stats_db_name}.result_organization ro
|
||||
join ${stats_db_name}.organization o on o.id=ro.organization
|
||||
where country <> 'UNKNOWN' and o.name is not null)
|
||||
select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
|
||||
from tmp as o1 join tmp as o2 on o1.id=o2.id
|
||||
where o1.id=o2.id and o1.country!=o2.country
|
||||
group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
|
||||
|
@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa
|
|||
select o.id organization, o.name, ro.project as project
|
||||
from ${stats_db_name}.organization o
|
||||
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
|
||||
select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.project=o2.project
|
||||
where o1.organization<>o2.organization and o1.name<>o2.name
|
||||
|
@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor
|
|||
select o.id organization, o.name, o.country , ro.project as project
|
||||
from ${stats_db_name}.organization o
|
||||
join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
|
||||
select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.project=o2.project
|
||||
where o1.organization<>o2.organization and o1.country<>o2.country
|
||||
|
@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as
|
|||
join ${stats_db_name}.organization o on o.id=op.id
|
||||
join ${stats_db_name}.project p on p.id=op.project
|
||||
where country <> 'UNKNOWN')
|
||||
select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
|
||||
select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
|
||||
from tmp as f1
|
||||
join tmp as f2 on f1.project=f2.project
|
||||
where f1.country<>f2.country
|
||||
|
@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
|
|||
select distinct country, ro.id as result from ${stats_db_name}.organization o
|
||||
join ${stats_db_name}.result_organization ro on o.id=ro.organization
|
||||
where country <> 'UNKNOWN' and o.name is not null)
|
||||
select o1.country country1, o2.country country2, count(o1.result) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.country<>o2.country
|
||||
|
@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
|
|||
---- Sprint 4 ----
|
||||
drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
|
||||
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
|
||||
select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
|
||||
from ${stats_db_name}.publication_datasources pd
|
||||
left outer join (
|
||||
select pd.id, 1 as in_diamond_journal
|
||||
|
@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
|
||||
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
|
||||
select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative
|
||||
from ${stats_db_name}.publication pd
|
||||
left outer join (
|
||||
select pd.id, 1 as is_transformative
|
||||
|
@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
|
||||
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
|
||||
select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
|
||||
from ${stats_db_name}.result_instance ri
|
||||
left outer join (
|
||||
select ri.id, 1 as pub_closed_other_open
|
||||
|
@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as
|
|||
---- Sprint 5 ----
|
||||
drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
|
||||
select id, count(id) as number_of_copies
|
||||
select /*+ COALESCE(100) */ id, count(id) as number_of_copies
|
||||
from ${stats_db_name}.result_instance
|
||||
group by id; /*EOS*/
|
||||
|
||||
---- Sprint 6 ----
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
|
||||
SELECT result_id, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats
|
||||
join ${stats_db_name}.publication on result_id=id
|
||||
where downloads>0
|
||||
|
@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
|
||||
SELECT result_id, repository_id, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats
|
||||
join ${stats_db_name}.publication on result_id=id
|
||||
where downloads>0
|
||||
|
@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
|
||||
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats us
|
||||
join ${stats_db_name}.publication on result_id=id where downloads>0
|
||||
GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
|
||||
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats us
|
||||
join ${stats_db_name}.publication on result_id=id
|
||||
where downloads>0
|
||||
|
@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
|
|||
UNION ALL
|
||||
select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo
|
||||
)
|
||||
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
|
||||
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
|
||||
FROM ${stats_db_name}.publication pd
|
||||
left outer join (
|
||||
select pd.id, 1 as is_gold
|
||||
|
@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
|
|||
FROM ${stats_db_name}.datasource
|
||||
WHERE issn_online IS NOT NULL ) as issn
|
||||
WHERE LENGTH(issn) > 7)
|
||||
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||
FROM ${stats_db_name}.publication_datasources pd
|
||||
LEFT OUTER JOIN (
|
||||
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
|
||||
|
@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
|
||||
select distinct p.id, coalesce(is_hybrid, 0) is_hybrid
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as is_hybrid
|
||||
|
@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet
|
|||
where cast(year as int)>2003
|
||||
group by ro.organization)
|
||||
--return results_fair/all_results
|
||||
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.organization=allresults.organization; /*EOS*/
|
||||
|
||||
|
@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
|
||||
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
from allresults ar
|
||||
join result_fair rf on rf.organization=ar.organization; /*EOS*/
|
||||
|
||||
|
@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
|
||||
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
|
||||
|
||||
|
@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
|
||||
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
from allresults ar join result_fair rf
|
||||
on rf.organization=ar.organization; /*EOS*/
|
||||
|
||||
|
@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
|
||||
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
|
||||
|
||||
|
@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
|
||||
select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
from allresults
|
||||
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
|
||||
|
||||
|
@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
|
|||
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
|
||||
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
from allresults
|
||||
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
|
||||
|
||||
|
@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof
|
|||
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
|
||||
select allpubsshare.organization,
|
||||
select /*+ COALESCE(100) */ allpubsshare.organization,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
org_openess FROM allpubsshare
|
||||
|
@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all
|
|||
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
|
||||
select cast(allpubsshare.year as int) year, allpubsshare.organization,
|
||||
select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
org_openess FROM allpubsshare
|
||||
|
@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
|
||||
select distinct p.id, coalesce(has_preprint, 0) as has_preprint
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint
|
||||
from ${stats_db_name}.publication_classifications p
|
||||
left outer join (
|
||||
select p.id, 1 as has_preprint
|
||||
|
@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p
|
|||
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
|
||||
select distinct p.id, coalesce(is_subscription, 0) as is_subscription
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join(
|
||||
select p.id, 1 as is_subscription from ${stats_db_name}.publication p
|
||||
|
@ -640,7 +640,7 @@ from ${stats_db_name}.publication p
|
|||
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
|
||||
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||
from ${stats_db_name}.result p
|
||||
left outer join (
|
||||
select p.id, 1 as result_with_pid
|
||||
|
@ -654,7 +654,7 @@ group by rf.id; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
|
||||
select distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||
select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||
as is_interdisciplinary
|
||||
from pub_fos_totals p
|
||||
left outer join (
|
||||
|
@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
|
||||
select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
|
||||
select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as is_bronze_oa
|
||||
|
@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as
|
||||
select pry.project_id, pry.acronym, pry.result_id,
|
||||
select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id,
|
||||
coalesce(is_project_result_after, 0) as is_project_result_after
|
||||
from project_year_result_year pry
|
||||
left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after
|
||||
|
@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as
|
||||
select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
|
||||
select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
|
||||
from ${stats_db_name}.funder f
|
||||
left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder
|
||||
join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp
|
||||
|
@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu
|
|||
join ${stats_db_name}.project p on p.id=rp.project
|
||||
where cast(year as int)>2003
|
||||
group by p.funder)
|
||||
select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
|
||||
select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.funder=allresults.funder; /*EOS*/
|
||||
|
||||
|
@ -745,7 +745,7 @@ allresults as
|
|||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
where cast(year as int)>2003
|
||||
group by rc.ri_initiative)
|
||||
select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
|
||||
select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/
|
||||
|
||||
|
@ -817,15 +817,13 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware
|
|||
drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as
|
||||
select allpubsshare.funder,
|
||||
select /*+ COALESCE(100) */ allpubsshare.funder,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
funder_openess FROM allpubsshare
|
||||
left outer join (select funder,d from
|
||||
alldatasetssshare) tmp1
|
||||
+(case when d is null then 0 else 1 end)) funder_openess
|
||||
FROM allpubsshare
|
||||
left outer join (select funder,d from alldatasetssshare) tmp1
|
||||
on tmp1.funder=allpubsshare.funder
|
||||
left outer join (select funder,s from
|
||||
allsoftwaresshare) tmp2
|
||||
left outer join (select funder,s from allsoftwaresshare) tmp2
|
||||
on tmp2.funder=allpubsshare.funder; /*EOS*/
|
||||
|
||||
DROP VIEW pubs_oa; /*EOS*/
|
||||
|
@ -905,7 +903,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso
|
|||
drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as
|
||||
select allpubsshare.ri_initiative,
|
||||
select /*+ COALESCE(100) */ allpubsshare.ri_initiative,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
ris_openess FROM allpubsshare
|
||||
|
@ -943,7 +941,7 @@ with result_findable as
|
|||
join ${stats_db_name}.project p on p.id=rp.project
|
||||
where cast(year as int)>2003
|
||||
group by p.funder)
|
||||
select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
|
||||
select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
|
||||
from allresults
|
||||
join result_findable on result_findable.funder=allresults.funder; /*EOS*/
|
||||
|
||||
|
@ -967,10 +965,12 @@ allresults as
|
|||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
where cast(r.year as int)>2003
|
||||
group by rc.ri_initiative)
|
||||
select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
|
||||
select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
|
||||
from allresults
|
||||
join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
|
||||
with org_names_pids as
|
||||
(select org.id,name, pid from ${stats_db_name}.organization org
|
||||
|
@ -986,7 +986,7 @@ union all
|
|||
select op.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
|
||||
and pf.publicly_funded='yes') foo)
|
||||
select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
|
||||
|
@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/
|
||||
create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as
|
||||
select distinct p.id, coalesce(green_with_license, 0) as green_with_license
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p
|
||||
|
@ -1006,7 +1006,7 @@ left outer join (
|
|||
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.result_country stored as parquet as
|
||||
select distinct id, country
|
||||
select /*+ COALESCE(100) */ distinct id, country
|
||||
from (
|
||||
select ro.id, o.country
|
||||
from ${stats_db_name}.result_organization ro
|
||||
|
@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
|
||||
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
|
||||
select distinct r.id, coalesce(oa_with_license,0) as oa_with_license
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/
|
||||
|
@ -1031,7 +1031,7 @@ create table ${stats_db_name}.indi_result_oa_without_license stored as parquet a
|
|||
with without_license as
|
||||
(select distinct id from ${stats_db_name}.indi_result_oa_with_license
|
||||
where oa_with_license=0)
|
||||
select distinct r.id, coalesce(oa_without_license,0) as oa_without_license
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (select distinct r.id, 1 as oa_without_license
|
||||
from ${stats_db_name}.result r
|
||||
|
@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /*
|
|||
create table ${stats_db_name}.indi_result_under_transformative stored as parquet as
|
||||
with transformative_dois as (
|
||||
select distinct doi from stats_ext.transformative_facts)
|
||||
select distinct r.id, coalesce(under_transformative,0) as under_transformative
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select distinct rp.id, 1 as under_transformative
|
||||
|
|
|
@ -1,30 +1,30 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------------
|
||||
-- Shortcuts for various definitions in stats db ---
|
||||
----------------------------------------------------
|
||||
|
||||
-- Peer reviewed:
|
||||
drop table if exists ${stats_db_name}.result_peerreviewed purge;
|
||||
drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
|
||||
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
|
||||
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
|
||||
select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
|
||||
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
|
||||
|
||||
-- Green OA:
|
||||
drop table if exists ${stats_db_name}.result_greenoa purge;
|
||||
drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
|
||||
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
|
||||
select r.id, case when green.green_oa=1 then true else false end as green
|
||||
select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
|
||||
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
|
||||
|
||||
-- GOLD OA:
|
||||
drop table if exists ${stats_db_name}.result_gold purge;
|
||||
drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
|
||||
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
|
||||
select r.id, case when gold.is_gold=1 then true else false end as gold
|
||||
select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
|
||||
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/
|
|
@ -1,58 +1,26 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
|
||||
-- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
|
||||
-- peer reviewed)
|
||||
drop table if exists ${stats_db_name}.result_tmp;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.result_tmp (
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
`date` STRING,
|
||||
`year` INT,
|
||||
bestlicence STRING,
|
||||
access_mode STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING ,
|
||||
peer_reviewed BOOLEAN,
|
||||
green BOOLEAN,
|
||||
gold BOOLEAN)
|
||||
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
|
||||
drop view if exists ${stats_db_name}.result; /*EOS*/
|
||||
drop table if exists ${stats_db_name}.result; /*EOS*/
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.publication r
|
||||
CREATE TABLE ${stats_db_name}.result stored as parquet as
|
||||
SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM (
|
||||
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.publication)
|
||||
UNION ALL
|
||||
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.dataset)
|
||||
UNION ALL
|
||||
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.software)
|
||||
UNION ALL
|
||||
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.otherresearchproduct)
|
||||
) r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.dataset r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.software r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.otherresearchproduct r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
drop table if exists ${stats_db_name}.result;
|
||||
drop view if exists ${stats_db_name}.result;
|
||||
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
|
||||
drop table ${stats_db_name}.result_tmp;
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
|
@ -7,65 +7,65 @@ set mapred.job.queue.name=analytics;
|
|||
--------------------------------------------------------------
|
||||
|
||||
-- Publication temporary table
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge;
|
||||
CREATE TABLE ${stats_db_name}.publication_tmp
|
||||
(
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
date STRING,
|
||||
year STRING,
|
||||
bestlicence STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication stored as parquet as
|
||||
with pub_pr as (
|
||||
select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||
from ${openaire_db_name}.publication pub
|
||||
join ${openaire_db_name}.relation rel
|
||||
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
|
||||
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
|
||||
),
|
||||
pub_delayed as (
|
||||
select pub_id, max(delayed) as delayed
|
||||
from pub_pr
|
||||
group by pub_id
|
||||
)
|
||||
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
|
||||
|
||||
INSERT INTO ${stats_db_name}.publication_tmp
|
||||
SELECT substr(p.id, 4) as id,
|
||||
p.title[0].value as title,
|
||||
p.publisher.value as publisher,
|
||||
p.journal.name as journal,
|
||||
p.dateofacceptance.value as date,
|
||||
date_format(p.dateofacceptance.value, 'yyyy') as year,
|
||||
p.bestaccessright.classname as bestlicence,
|
||||
p.embargoenddate.value as embargo_end_date,
|
||||
false as delayed,
|
||||
size(p.author) as authors,
|
||||
concat_ws('\u003B', p.source.value) as source,
|
||||
case when size(p.description) > 0 then true else false end as abstract,
|
||||
select /*+ COALESCE(100) */
|
||||
substr(pub.id, 4) as id,
|
||||
pub.title[0].value as title,
|
||||
pub.publisher.value as publisher,
|
||||
pub.journal.name as journal,
|
||||
pub.dateofacceptance.value as date,
|
||||
date_format(pub.dateofacceptance.value, 'yyyy') as year,
|
||||
pub.bestaccessright.classname as bestlicence,
|
||||
pub.embargoenddate.value as embargo_end_date,
|
||||
coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
|
||||
size(pub.author) as authors,
|
||||
concat_ws('\u003B', pub.source.value) as source,
|
||||
case when size(pub.description) > 0 then true else false end as abstract,
|
||||
'publication' as type
|
||||
from ${openaire_db_name}.publication p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
from ${openaire_db_name}.publication pub
|
||||
left outer join pub_delayed on pub.id=pub_delayed.pub_id
|
||||
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, instancetype.classname as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type
|
||||
from ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, case
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
|
||||
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
|
||||
from ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.context) contexts as context
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
|
||||
from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
|
||||
|
@ -73,44 +73,44 @@ FROM (
|
|||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
|
||||
select substr(p.id, 4) as id, p.language.classname as language
|
||||
select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language
|
||||
FROM ${openaire_db_name}.publication p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.originalid) oids AS ids
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
|
||||
FROM ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.pid) pids AS ppid
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
|
||||
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
|
||||
select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
|
||||
FROM ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.subject) subjects AS subject
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
FROM ${openaire_db_name}.publication p
|
||||
lateral view explode(p.extrainfo) citations AS citation
|
||||
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
|
||||
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
@ -85,7 +87,13 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
) )) foo;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
|
@ -256,7 +264,6 @@ create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * f
|
|||
|
||||
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue