Compare commits
362 Commits
affiliatio...main
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | ed560dacc0 | |
Miriam Baglioni | 07a1f2b31c | |
Claudio Atzori | 80d7b842e4 | |
Claudio Atzori | dd397d107d | |
Giambattista Bloisi | 3152382ae8 | |
Claudio Atzori | a50e04154e | |
Claudio Atzori | c4e8aaca1f | |
Claudio Atzori | 1596d70224 | |
Claudio Atzori | 5d030d1118 | |
Claudio Atzori | 6e0b6a886f | |
Claudio Atzori | 3854fcc5e0 | |
Miriam Baglioni | 371154d74f | |
Claudio Atzori | 4e9f64e01a | |
Giambattista Bloisi | d175a9745f | |
Michele De Bonis | fe70caa33c | |
Claudio Atzori | 81bfe3fe32 | |
Miriam Baglioni | 0765641979 | |
Miriam Baglioni | d0eba032cd | |
Miriam Baglioni | 7cd8171268 | |
Miriam Baglioni | a54d021c37 | |
Miriam Baglioni | 6eea075324 | |
Claudio Atzori | 2ba67f08d3 | |
Miriam Baglioni | df39360822 | |
Claudio Atzori | c1a309df75 | |
Claudio Atzori | 5fdc286eb9 | |
Claudio Atzori | e7f6eb82df | |
Claudio Atzori | 9c7711310e | |
Michele Artini | 0c66b8589d | |
Michele Artini | 758d4acd05 | |
Sandro La Bruzzo | 890190b7ae | |
Claudio Atzori | 24b5dc97c6 | |
Claudio Atzori | c648531ccb | |
Giambattista Bloisi | 10cad80d4d | |
Giambattista Bloisi | 37b9bdc10c | |
Giambattista Bloisi | e7150eea7b | |
Giambattista Bloisi | 23477f3e80 | |
Claudio Atzori | ce78752aa3 | |
Claudio Atzori | 152cb47375 | |
Miriam Baglioni | f1dc0050c7 | |
Miriam Baglioni | 42531afc3e | |
Miriam Baglioni | 907eeadce8 | |
Claudio Atzori | 6b4fa7b8b9 | |
Claudio Atzori | b8bc237079 | |
Claudio Atzori | ed6d71fc70 | |
Miriam Baglioni | cbe877b73c | |
Claudio Atzori | 5fc413a5df | |
Claudio Atzori | 97c9706469 | |
Claudio Atzori | 07e7b9315c | |
Alessia | 39810c6e7e | |
Claudio Atzori | e0f58afd30 | |
Claudio Atzori | 60cf7d86a1 | |
Miriam Baglioni | 8f11dfe554 | |
Claudio Atzori | d20a5e020a | |
Claudio Atzori | 3d1d8e6036 | |
Claudio Atzori | 0b1c58358b | |
Claudio Atzori | b70a440aca | |
Michele Artini | 36c3df1652 | |
Claudio Atzori | 2f13683285 | |
Claudio Atzori | 5ab409dcab | |
Claudio Atzori | b756cfeb85 | |
Claudio Atzori | 51d6a541bd | |
Claudio Atzori | 07ce92cef2 | |
Miriam Baglioni | f043b7b096 | |
Claudio Atzori | 153b56eeff | |
Claudio Atzori | ed97ba4565 | |
Claudio Atzori | 7b398a6d0b | |
Claudio Atzori | 13f6506ce5 | |
Claudio Atzori | 3d9ddaa23a | |
Claudio Atzori | c06dfdfd86 | |
Claudio Atzori | b822b34abe | |
Michele De Bonis | ea1841fbd2 | |
Miriam Baglioni | 4dbce39237 | |
Miriam Baglioni | 3ee8a7d18a | |
Claudio Atzori | ee7deb3f60 | |
Claudio Atzori | 157cc8be87 | |
Claudio Atzori | 023099a921 | |
Claudio Atzori | 786c217085 | |
Lampros Smyrnaios | c858c02111 | |
Claudio Atzori | 8220e27110 | |
Claudio Atzori | bc993d49c1 | |
Claudio Atzori | 1dc7458de2 | |
Claudio Atzori | a7a54aab47 | |
Miriam Baglioni | eaa00a4199 | |
Claudio Atzori | fb731b6d46 | |
Miriam Baglioni | b6da35e736 | |
Lampros Smyrnaios | 3c9b8de892 | |
Antonis Lempesis | c67ef157d3 | |
Lampros Smyrnaios | c23f3031ed | |
Claudio Atzori | 8ec151aa3d | |
Claudio Atzori | 2636936162 | |
Miriam Baglioni | ef437a8cdf | |
Miriam Baglioni | 86088ef26e | |
Miriam Baglioni | 143c525343 | |
Claudio Atzori | c371513d43 | |
Claudio Atzori | 71927ca818 | |
Giambattista Bloisi | 46018dc804 | |
Miriam Baglioni | 3efd5b1308 | |
Miriam Baglioni | 196fa55774 | |
Miriam Baglioni | 50805e3fc1 | |
Claudio Atzori | d39a1054b8 | |
Claudio Atzori | 576efc1857 | |
Claudio Atzori | efc1632e16 | |
Claudio Atzori | 91b49366c6 | |
Claudio Atzori | 5e05385d35 | |
Miriam Baglioni | c4d9b5b9d2 | |
Miriam Baglioni | bf9a5e6314 | |
Miriam Baglioni | 9d79ddb3dd | |
Miriam Baglioni | 907aa28c6c | |
Miriam Baglioni | 3955ceaa76 | |
Miriam Baglioni | 128c143394 | |
Claudio Atzori | 5133993ee5 | |
Claudio Atzori | 5cf259a851 | |
Claudio Atzori | e1828fc60e | |
Claudio Atzori | 56920b447d | |
Giambattista Bloisi | 3feab5d92d | |
Claudio Atzori | 6be783caec | |
Claudio Atzori | b703f94f09 | |
Miriam Baglioni | 14f275ffaf | |
Claudio Atzori | a428e7be7e | |
Claudio Atzori | 8e45c5baa8 | |
Claudio Atzori | db5e18c784 | |
Claudio Atzori | fb266efbcb | |
Claudio Atzori | d7daf54333 | |
Claudio Atzori | f99eaa0376 | |
Claudio Atzori | 23312fcc1e | |
Miriam Baglioni | b864f0adcf | |
Miriam Baglioni | 7a44869d87 | |
Miriam Baglioni | 12ffde023f | |
Claudio Atzori | c3fe59bc78 | |
Claudio Atzori | 795e1b2629 | |
Claudio Atzori | 0c05abe50b | |
Claudio Atzori | 8fdd0244ad | |
Claudio Atzori | 18fdaaf548 | |
Claudio Atzori | 43e123c624 | |
Claudio Atzori | 62a07b7add | |
Claudio Atzori | 96bddcc921 | |
Miriam Baglioni | 0486cea4c4 | |
Claudio Atzori | 013935c593 | |
Claudio Atzori | 6132bd028e | |
Miriam Baglioni | 519db1ddef | |
Claudio Atzori | 5add51f38c | |
Claudio Atzori | f01390702e | |
Claudio Atzori | 5592ccc37a | |
Claudio Atzori | d16c15da8d | |
Claudio Atzori | 09a6d17059 | |
Claudio Atzori | d70793847d | |
Michele De Bonis | f6601ea7d1 | |
Michele De Bonis | cd4c3c934d | |
Michele Artini | a99942f7cf | |
Michele Artini | 7f7083f53e | |
Michele Artini | d9b23a76c5 | |
Michele Artini | 841ca92246 | |
Michele Artini | 3bcfc40293 | |
Giambattista Bloisi | 3067ea390d | |
Miriam Baglioni | c94d94035c | |
Michele Artini | 4374d7449e | |
Claudio Atzori | 07d009007b | |
Claudio Atzori | 071d044971 | |
Claudio Atzori | b3ddbaed58 | |
Claudio Atzori | 1416f16b35 | |
Giambattista Bloisi | ba1a0e7b4f | |
Giambattista Bloisi | 079085286c | |
Giambattista Bloisi | 8dd666aedd | |
Claudio Atzori | f21133229a | |
Claudio Atzori | d86b909db2 | |
Claudio Atzori | 08162902ab | |
Claudio Atzori | e8630a6d03 | |
Claudio Atzori | f28c63d5ef | |
Claudio Atzori | 1a8b609ed2 | |
Miriam Baglioni | 4c8706efee | |
Claudio Atzori | 4d0c59669b | |
Sandro La Bruzzo | 3c8c88bdd3 | |
Claudio Atzori | 106968adaa | |
Claudio Atzori | a8a4db96f0 | |
Sandro La Bruzzo | 37e36baf76 | |
Sandro La Bruzzo | 9d39845d1f | |
Sandro La Bruzzo | 1fbd4325f5 | |
Sandro La Bruzzo | 1f1a6a5f5f | |
Claudio Atzori | c4ec35b6cd | |
Claudio Atzori | 1726f49790 | |
Claudio Atzori | 1763d377ad | |
Claudio Atzori | a0311e8a90 | |
Claudio Atzori | 8fb05888fd | |
Claudio Atzori | 2b626815ff | |
Miriam Baglioni | b177cd5a0a | |
Serafeim Chatzopoulos | 671ba8a5a7 | |
Claudio Atzori | 5f1ed61c1f | |
Claudio Atzori | 8c03c41d5d | |
Claudio Atzori | 97454e9594 | |
Serafeim Chatzopoulos | 7e34dde774 | |
Serafeim Chatzopoulos | 24c3f92d87 | |
Serafeim Chatzopoulos | 6ce9b600c1 | |
Serafeim Chatzopoulos | 94089878fd | |
Miriam Baglioni | 0097f4e64b | |
Miriam Baglioni | 5c5a195e97 | |
Miriam Baglioni | 70b78a40c7 | |
Miriam Baglioni | f206ff42d6 | |
Miriam Baglioni | 34358afe75 | |
Miriam Baglioni | 18bfff8af3 | |
Miriam Baglioni | 69dac91659 | |
Miriam Baglioni | a9ede1e989 | |
Claudio Atzori | 242d647146 | |
Claudio Atzori | af3ffad6c4 | |
Claudio Atzori | ba5475ed4c | |
Giambattista Bloisi | 2c235e82ad | |
Claudio Atzori | 4ac06c9e37 | |
Claudio Atzori | fa692b3629 | |
Claudio Atzori | ef02648399 | |
Claudio Atzori | d13bb534f0 | |
Giambattista Bloisi | 775c3f704a | |
Sandro La Bruzzo | 9c3ab11d5b | |
Sandro La Bruzzo | 423ef30676 | |
Giambattista Bloisi | 7152d47f84 | |
Claudio Atzori | 4853c19b5e | |
Giambattista Bloisi | 1f226d1dce | |
Alessia Bardi | 6186cdc2cc | |
Alessia Bardi | d94b9bebf7 | |
Alessia Bardi | 19abba8fa7 | |
Claudio Atzori | c2f179800c | |
Serafeim Chatzopoulos | 2aed5a74be | |
Claudio Atzori | 4dc4862011 | |
Claudio Atzori | dc80ab14d3 | |
Alessia Bardi | 77a2199837 | |
Claudio Atzori | 265180bfd2 | |
Claudio Atzori | da0e9828f7 | |
Miriam Baglioni | 599828ce35 | |
Claudio Atzori | 0bc74e2000 | |
Claudio Atzori | 7180911ded | |
Claudio Atzori | da1727f93f | |
Claudio Atzori | ccac6a7f75 | |
Claudio Atzori | d512df8612 | |
Claudio Atzori | 59764145bb | |
Miriam Baglioni | 9e8e39f78a | |
Claudio Atzori | 373a5f2c83 | |
Claudio Atzori | 8af129b0c7 | |
dimitrispie | 706092bc19 | |
dimitrispie | aedd279f78 | |
Miriam Baglioni | 8dcd028eed | |
Miriam Baglioni | 8621377917 | |
Miriam Baglioni | ef2dd7a980 | |
Claudio Atzori | f3a85e224b | |
Claudio Atzori | 4ef0f2ec26 | |
Claudio Atzori | 288ec0b7d6 | |
Claudio Atzori | 5f32edd9bf | |
Claudio Atzori | e10ce92fe5 | |
Claudio Atzori | b93e1541aa | |
Claudio Atzori | d029bf0b94 | |
Michele Artini | 009d7f312f | |
Miriam Baglioni | e4b27182d0 | |
Giambattista Bloisi | 758e662ab8 | |
Giambattista Bloisi | 485f9d18cb | |
Michele Artini | a92206dab5 | |
Miriam Baglioni | d9506035e4 | |
Alessia Bardi | 118e72d7db | |
Alessia Bardi | 5befd93d7d | |
Michele Artini | cae92cf811 | |
Miriam Baglioni | b64a5eb4a5 | |
Claudio Atzori | 654ffcba60 | |
Claudio Atzori | db625e548d | |
Alessia Bardi | 04141fe259 | |
Alessia Bardi | b88f009d9f | |
Alessia Bardi | 5ffe82ffd8 | |
Alessia Bardi | 1c173642f0 | |
Alessia Bardi | 382f46a8e4 | |
Miriam Baglioni | 9fc8ebe98b | |
Miriam Baglioni | 24c41806ac | |
Miriam Baglioni | 087b5a7973 | |
Claudio Atzori | 688e3b7936 | |
Claudio Atzori | 2e465915b4 | |
Claudio Atzori | 4a4ca634f0 | |
Miriam Baglioni | c6a7602b3e | |
Miriam Baglioni | 831055a1fc | |
Miriam Baglioni | cf3d0f4f83 | |
Claudio Atzori | 4f67225fbc | |
Claudio Atzori | e093f04874 | |
Miriam Baglioni | c5a9f39141 | |
Miriam Baglioni | ecc05fe0f3 | |
Claudio Atzori | 42442ccd39 | |
Miriam Baglioni | 9a9cc6a1dd | |
Michele Artini | 200098b683 | |
Michele Artini | 9c1df15071 | |
Miriam Baglioni | 32870339f5 | |
Miriam Baglioni | 7184cc0804 | |
Miriam Baglioni | 7473093c84 | |
Miriam Baglioni | 5f0906be60 | |
Claudio Atzori | 1b37516578 | |
Claudio Atzori | c1e2460293 | |
Claudio Atzori | 3800361033 | |
Michele Artini | 699736addc | |
Claudio Atzori | f86e19b282 | |
Michele Artini | d40e20f437 | |
Michele Artini | 4953ae5649 | |
Miriam Baglioni | c60d3a2b46 | |
Claudio Atzori | 7becdaf31d | |
Miriam Baglioni | b713132db7 | |
Miriam Baglioni | 11f2b470d3 | |
Sandro La Bruzzo | 91c70b15a5 | |
Claudio Atzori | f910b7379d | |
Claudio Atzori | 33bdad104e | |
Claudio Atzori | 5816ded93f | |
Claudio Atzori | 46972f8393 | |
Claudio Atzori | da85ca697d | |
Miriam Baglioni | 059e100ec7 | |
Miriam Baglioni | fc95a550c3 | |
Miriam Baglioni | 6901ac91b1 | |
Claudio Atzori | 08c4588d47 | |
Miriam Baglioni | 29d3da85f1 | |
Miriam Baglioni | 33a2b1b5dc | |
Miriam Baglioni | c6df8327b3 | |
Miriam Baglioni | 935aa367d8 | |
Miriam Baglioni | 43aedbdfe5 | |
Miriam Baglioni | b6da9b67ff | |
Claudio Atzori | a34c8b6f81 | |
Miriam Baglioni | 122e75aa17 | |
Miriam Baglioni | cee7a45b1d | |
Claudio Atzori | ed64618235 | |
Claudio Atzori | 8742934843 | |
Claudio Atzori | 13cc592f39 | |
Claudio Atzori | af15b1e48d | |
Claudio Atzori | eb45ba7af0 | |
Claudio Atzori | a929dc5fee | |
Miriam Baglioni | 5f9383b2d9 | |
Miriam Baglioni | b18bbca8af | |
dimitrispie | 55fa3b2a17 | |
Claudio Atzori | 80c5e0f637 | |
Claudio Atzori | c01d528ab2 | |
Claudio Atzori | e6d788d27a | |
Claudio Atzori | 930f118673 | |
Claudio Atzori | b2c3071e72 | |
Claudio Atzori | 10ec074f79 | |
Claudio Atzori | 7225fe9cbe | |
Miriam Baglioni | 869e129288 | |
Miriam Baglioni | 840465958b | |
Claudio Atzori | bdc8f993d0 | |
Miriam Baglioni | ec87149cb3 | |
Miriam Baglioni | b42e2c9df6 | |
Miriam Baglioni | 1329aa8479 | |
Miriam Baglioni | a0ee1a8640 | |
Claudio Atzori | 96062164f9 | |
Claudio Atzori | 35bb7c423f | |
Claudio Atzori | fd87571506 | |
Claudio Atzori | c527112e33 | |
Claudio Atzori | 65209359bc | |
Claudio Atzori | d72a64ded3 | |
Claudio Atzori | 3e8499ce47 | |
Claudio Atzori | 61aacb3271 | |
Claudio Atzori | dbb567251a | |
Claudio Atzori | c7e8ad853e | |
Claudio Atzori | 0849ebfd80 | |
Claudio Atzori | 281239249e | |
Claudio Atzori | 45fc5e12be | |
Claudio Atzori | 1c05aaaa2e | |
Claudio Atzori | 01d5ad6361 | |
Claudio Atzori | d872d1cdd9 | |
Claudio Atzori | ab0efecab4 | |
Claudio Atzori | 725c3c68d0 | |
Claudio Atzori | 300ae6221c | |
Claudio Atzori | 0ec2eaba35 | |
Claudio Atzori | a387807d43 | |
Claudio Atzori | 2abe2bc137 | |
Claudio Atzori | a07c876922 | |
Claudio Atzori | cbd48bc645 | |
```diff
@@ -7,12 +7,12 @@ import java.sql.*;
 import java.util.function.Consumer;
 
 import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class DbClient implements Closeable {
 
-	private static final Log log = LogFactory.getLog(DbClient.class);
+	private static final Logger log = LoggerFactory.getLogger(DbClient.class);
 
 	private final Connection connection;
 
@@ -37,6 +37,8 @@ public class DbClient implements Closeable {
 		try (final Statement stmt = connection.createStatement()) {
 			stmt.setFetchSize(100);
 
+			log.info("running SQL:\n\n{}\n\n", sql);
+
 			try (final ResultSet rs = stmt.executeQuery(sql)) {
 				while (rs.next()) {
 					consumer.accept(rs);
```
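The hunk above migrates DbClient from Apache commons-logging to SLF4J and adds a statement-level log line. A minimal sketch of the SLF4J idiom the new code relies on (the class name is illustrative):

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Example {

	private static final Logger log = LoggerFactory.getLogger(Example.class);

	public void run(String sql) {
		// SLF4J defers message construction: the string is only built
		// if the INFO level is enabled, unlike manual concatenation.
		log.info("running SQL:\n\n{}\n\n", sql);
	}
}
```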
```diff
@@ -1,53 +0,0 @@
-
-package eu.dnetlib.dhp.common.api;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import okhttp3.MediaType;
-import okhttp3.RequestBody;
-import okhttp3.internal.Util;
-import okio.BufferedSink;
-import okio.Okio;
-import okio.Source;
-
-public class InputStreamRequestBody extends RequestBody {
-
-	private final InputStream inputStream;
-	private final MediaType mediaType;
-	private final long lenght;
-
-	public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
-
-		return new InputStreamRequestBody(inputStream, mediaType, len);
-	}
-
-	private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
-		this.inputStream = inputStream;
-		this.mediaType = mediaType;
-		this.lenght = len;
-	}
-
-	@Override
-	public MediaType contentType() {
-		return mediaType;
-	}
-
-	@Override
-	public long contentLength() {
-
-		return lenght;
-
-	}
-
-	@Override
-	public void writeTo(BufferedSink sink) throws IOException {
-		Source source = null;
-		try {
-			source = Okio.source(inputStream);
-			sink.writeAll(source);
-		} finally {
-			Util.closeQuietly(source);
-		}
-	}
-}
```
```diff
@@ -1,8 +0,0 @@
-
-package eu.dnetlib.dhp.common.api;
-
-public class MissingConceptDoiException extends Throwable {
-	public MissingConceptDoiException(String message) {
-		super(message);
-	}
-}
```
```diff
@@ -1,363 +0,0 @@
-
-package eu.dnetlib.dhp.common.api;
-
-import java.io.*;
-import java.io.IOException;
-import java.net.HttpURLConnection;
-import java.net.URL;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.http.HttpHeaders;
-import org.apache.http.entity.ContentType;
-import org.jetbrains.annotations.NotNull;
-
-import com.google.gson.Gson;
-
-import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
-import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
-import okhttp3.*;
-
-public class ZenodoAPIClient implements Serializable {
-
-	String urlString;
-	String bucket;
-
-	String deposition_id;
-	String access_token;
-
-	public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
-
-	private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
-
-	public String getUrlString() {
-		return urlString;
-	}
-
-	public void setUrlString(String urlString) {
-		this.urlString = urlString;
-	}
-
-	public String getBucket() {
-		return bucket;
-	}
-
-	public void setBucket(String bucket) {
-		this.bucket = bucket;
-	}
-
-	public void setDeposition_id(String deposition_id) {
-		this.deposition_id = deposition_id;
-	}
-
-	public ZenodoAPIClient(String urlString, String access_token) {
-
-		this.urlString = urlString;
-		this.access_token = access_token;
-	}
-
-	/**
-	 * Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
-	 *
-	 * @return response code
-	 * @throws IOException
-	 */
-	public int newDeposition() throws IOException {
-		String json = "{}";
-
-		URL url = new URL(urlString);
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-		conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setRequestMethod("POST");
-		conn.setDoOutput(true);
-		try (OutputStream os = conn.getOutputStream()) {
-			byte[] input = json.getBytes("utf-8");
-			os.write(input, 0, input.length);
-		}
-
-		String body = getBody(conn);
-
-		int responseCode = conn.getResponseCode();
-		conn.disconnect();
-
-		if (!checkOKStatus(responseCode))
-			throw new IOException("Unexpected code " + responseCode + body);
-
-		ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
-		this.bucket = newSubmission.getLinks().getBucket();
-		this.deposition_id = newSubmission.getId();
-
-		return responseCode;
-	}
-
-	/**
-	 * Upload files in Zenodo.
-	 *
-	 * @param is the inputStream for the file to upload
-	 * @param file_name the name of the file as it will appear on Zenodo
-	 * @return the response code
-	 */
-	public int uploadIS(InputStream is, String file_name) throws IOException {
-
-		URL url = new URL(bucket + "/" + file_name);
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-		conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setDoOutput(true);
-		conn.setRequestMethod("PUT");
-
-		byte[] buf = new byte[8192];
-		int length;
-		try (OutputStream os = conn.getOutputStream()) {
-			while ((length = is.read(buf)) != -1) {
-				os.write(buf, 0, length);
-			}
-
-		}
-		int responseCode = conn.getResponseCode();
-		if (!checkOKStatus(responseCode)) {
-			throw new IOException("Unexpected code " + responseCode + getBody(conn));
-		}
-
-		return responseCode;
-	}
-
-	@NotNull
-	private String getBody(HttpURLConnection conn) throws IOException {
-		String body = "{}";
-		try (BufferedReader br = new BufferedReader(
-			new InputStreamReader(conn.getInputStream(), "utf-8"))) {
-			StringBuilder response = new StringBuilder();
-			String responseLine = null;
-			while ((responseLine = br.readLine()) != null) {
-				response.append(responseLine.trim());
-			}
-
-			body = response.toString();
-
-		}
-		return body;
-	}
-
-	/**
-	 * Associates metadata information to the current deposition
-	 *
-	 * @param metadata the metadata
-	 * @return response code
-	 * @throws IOException
-	 */
-	public int sendMretadata(String metadata) throws IOException {
-
-		URL url = new URL(urlString + "/" + deposition_id);
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-		conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setDoOutput(true);
-		conn.setRequestMethod("PUT");
-
-		try (OutputStream os = conn.getOutputStream()) {
-			byte[] input = metadata.getBytes("utf-8");
-			os.write(input, 0, input.length);
-
-		}
-
-		final int responseCode = conn.getResponseCode();
-		conn.disconnect();
-		if (!checkOKStatus(responseCode))
-			throw new IOException("Unexpected code " + responseCode + getBody(conn));
-
-		return responseCode;
-
-	}
-
-	private boolean checkOKStatus(int responseCode) {
-
-		if (HttpURLConnection.HTTP_OK != responseCode ||
-			HttpURLConnection.HTTP_CREATED != responseCode)
-			return true;
-		return false;
-	}
-
-	/**
-	 * To publish the current deposition. It works for both new deposition or new version of an old deposition
-	 *
-	 * @return response code
-	 * @throws IOException
-	 */
-	@Deprecated
-	public int publish() throws IOException {
-
-		String json = "{}";
-
-		OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
-
-		RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
-
-		Request request = new Request.Builder()
-			.url(urlString + "/" + deposition_id + "/actions/publish")
-			.addHeader("Authorization", "Bearer " + access_token)
-			.post(body)
-			.build();
-
-		try (Response response = httpClient.newCall(request).execute()) {
-
-			if (!response.isSuccessful())
-				throw new IOException("Unexpected code " + response + response.body().string());
-
-			return response.code();
-
-		}
-	}
-
-	/**
-	 * To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
-	 * for the new version.
-	 *
-	 * @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
-	 * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
-	 * concept_rec_id = 656930
-	 * @return response code
-	 */
-	public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
-		setDepositionId(concept_rec_id, 1);
-		String json = "{}";
-
-		URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setDoOutput(true);
-		conn.setRequestMethod("POST");
-
-		try (OutputStream os = conn.getOutputStream()) {
-			byte[] input = json.getBytes("utf-8");
-			os.write(input, 0, input.length);
-
-		}
-
-		String body = getBody(conn);
-
-		int responseCode = conn.getResponseCode();
-
-		conn.disconnect();
-		if (!checkOKStatus(responseCode))
-			throw new IOException("Unexpected code " + responseCode + body);
-
-		ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
-		String latest_draft = zenodoModel.getLinks().getLatest_draft();
-		deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
-		bucket = getBucket(latest_draft);
-
-		return responseCode;
-
-	}
-
-	/**
-	 * To finish uploading a version or new deposition not published
-	 * It sets the deposition_id and the bucket to be used
-	 *
-	 *
-	 * @param deposition_id the deposition id of the not yet published upload
-	 * concept_rec_id = 656930
-	 * @return response code
-	 * @throws IOException
-	 * @throws MissingConceptDoiException
-	 */
-	public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
-
-		this.deposition_id = deposition_id;
-
-		String json = "{}";
-
-		URL url = new URL(urlString + "/" + deposition_id);
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setRequestMethod("POST");
-		conn.setDoOutput(true);
-		try (OutputStream os = conn.getOutputStream()) {
-			byte[] input = json.getBytes("utf-8");
-			os.write(input, 0, input.length);
-		}
-
-		String body = getBody(conn);
-
-		int responseCode = conn.getResponseCode();
-		conn.disconnect();
-
-		if (!checkOKStatus(responseCode))
-			throw new IOException("Unexpected code " + responseCode + body);
-
-		ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
-		bucket = zenodoModel.getLinks().getBucket();
-
-		return responseCode;
-
-	}
-
-	private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
-
-		ZenodoModelList zenodoModelList = new Gson()
-			.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
-
-		for (ZenodoModel zm : zenodoModelList) {
-			if (zm.getConceptrecid().equals(concept_rec_id)) {
-				deposition_id = zm.getId();
-				return;
-			}
-		}
-		if (zenodoModelList.size() == 0)
-			throw new MissingConceptDoiException(
-				"The concept record id specified was missing in the list of depositions");
-		setDepositionId(concept_rec_id, page + 1);
-
-	}
-
-	private String getPrevDepositions(String page) throws IOException {
-
-		HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
-		urlBuilder.addQueryParameter("page", page);
-
-		URL url = new URL(urlBuilder.build().toString());
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-		conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setDoOutput(true);
-		conn.setRequestMethod("GET");
-
-		String body = getBody(conn);
-
-		int responseCode = conn.getResponseCode();
-
-		conn.disconnect();
-		if (!checkOKStatus(responseCode))
-			throw new IOException("Unexpected code " + responseCode + body);
-
-		return body;
-
-	}
-
-	private String getBucket(String inputUurl) throws IOException {
-
-		URL url = new URL(inputUurl);
-		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-		conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
-		conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
-		conn.setDoOutput(true);
-		conn.setRequestMethod("GET");
-
-		String body = getBody(conn);
-
-		int responseCode = conn.getResponseCode();
-
-		conn.disconnect();
-		if (!checkOKStatus(responseCode))
-			throw new IOException("Unexpected code " + responseCode + body);
-
-		ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
-
-		return zenodoModel.getLinks().getBucket();
-
-	}
-
-}
```
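The change above (together with the model classes and test removed below) drops the whole okhttp/Gson-based Zenodo deposit client. For reference, a sketch of the call sequence the deleted client supported, reconstructed from the removed code; the URL, token, and file name are placeholders:

```java
import java.io.FileInputStream;
import java.io.InputStream;

import eu.dnetlib.dhp.common.api.ZenodoAPIClient;

public class ZenodoDepositSketch {
	public static void main(String[] args) throws Exception {
		ZenodoAPIClient client = new ZenodoAPIClient(
			"https://sandbox.zenodo.org/api/deposit/depositions", "<ACCESS_TOKEN>");

		client.newDeposition(); // POST: creates the deposition, remembering id and bucket
		try (InputStream is = new FileInputStream("dump.json.gz")) {
			client.uploadIS(is, "dump.json.gz"); // PUT the payload into the bucket
		}
		client.sendMretadata("{\"metadata\": {\"title\": \"example\"}}"); // PUT metadata (sic)
		client.publish(); // POST .../actions/publish (already @Deprecated in the deleted code)
	}
}
```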
```diff
@@ -1,14 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-public class Community {
-	private String identifier;
-
-	public String getIdentifier() {
-		return identifier;
-	}
-
-	public void setIdentifier(String identifier) {
-		this.identifier = identifier;
-	}
-}
```
```diff
@@ -1,47 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-public class Creator {
-	private String affiliation;
-	private String name;
-	private String orcid;
-
-	public String getAffiliation() {
-		return affiliation;
-	}
-
-	public void setAffiliation(String affiliation) {
-		this.affiliation = affiliation;
-	}
-
-	public String getName() {
-		return name;
-	}
-
-	public void setName(String name) {
-		this.name = name;
-	}
-
-	public String getOrcid() {
-		return orcid;
-	}
-
-	public void setOrcid(String orcid) {
-		this.orcid = orcid;
-	}
-
-	public static Creator newInstance(String name, String affiliation, String orcid) {
-		Creator c = new Creator();
-		if (name != null) {
-			c.name = name;
-		}
-		if (affiliation != null) {
-			c.affiliation = affiliation;
-		}
-		if (orcid != null) {
-			c.orcid = orcid;
-		}
-
-		return c;
-	}
-}
```
```diff
@@ -1,44 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-
-public class File implements Serializable {
-	private String checksum;
-	private String filename;
-	private long filesize;
-	private String id;
-
-	public String getChecksum() {
-		return checksum;
-	}
-
-	public void setChecksum(String checksum) {
-		this.checksum = checksum;
-	}
-
-	public String getFilename() {
-		return filename;
-	}
-
-	public void setFilename(String filename) {
-		this.filename = filename;
-	}
-
-	public long getFilesize() {
-		return filesize;
-	}
-
-	public void setFilesize(long filesize) {
-		this.filesize = filesize;
-	}
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(String id) {
-		this.id = id;
-	}
-
-}
```
```diff
@@ -1,23 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-
-public class Grant implements Serializable {
-	private String id;
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(String id) {
-		this.id = id;
-	}
-
-	public static Grant newInstance(String id) {
-		Grant g = new Grant();
-		g.id = id;
-
-		return g;
-	}
-}
```
```diff
@@ -1,92 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-
-public class Links implements Serializable {
-
-	private String bucket;
-
-	private String discard;
-
-	private String edit;
-	private String files;
-	private String html;
-	private String latest_draft;
-	private String latest_draft_html;
-	private String publish;
-
-	private String self;
-
-	public String getBucket() {
-		return bucket;
-	}
-
-	public void setBucket(String bucket) {
-		this.bucket = bucket;
-	}
-
-	public String getDiscard() {
-		return discard;
-	}
-
-	public void setDiscard(String discard) {
-		this.discard = discard;
-	}
-
-	public String getEdit() {
-		return edit;
-	}
-
-	public void setEdit(String edit) {
-		this.edit = edit;
-	}
-
-	public String getFiles() {
-		return files;
-	}
-
-	public void setFiles(String files) {
-		this.files = files;
-	}
-
-	public String getHtml() {
-		return html;
-	}
-
-	public void setHtml(String html) {
-		this.html = html;
-	}
-
-	public String getLatest_draft() {
-		return latest_draft;
-	}
-
-	public void setLatest_draft(String latest_draft) {
-		this.latest_draft = latest_draft;
-	}
-
-	public String getLatest_draft_html() {
-		return latest_draft_html;
-	}
-
-	public void setLatest_draft_html(String latest_draft_html) {
-		this.latest_draft_html = latest_draft_html;
-	}
-
-	public String getPublish() {
-		return publish;
-	}
-
-	public void setPublish(String publish) {
-		this.publish = publish;
-	}
-
-	public String getSelf() {
-		return self;
-	}
-
-	public void setSelf(String self) {
-		this.self = self;
-	}
-}
```
```diff
@@ -1,153 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-import java.util.List;
-
-public class Metadata implements Serializable {
-
-	private String access_right;
-	private List<Community> communities;
-	private List<Creator> creators;
-	private String description;
-	private String doi;
-	private List<Grant> grants;
-	private List<String> keywords;
-	private String language;
-	private String license;
-	private PrereserveDoi prereserve_doi;
-	private String publication_date;
-	private List<String> references;
-	private List<RelatedIdentifier> related_identifiers;
-	private String title;
-	private String upload_type;
-	private String version;
-
-	public String getUpload_type() {
-		return upload_type;
-	}
-
-	public void setUpload_type(String upload_type) {
-		this.upload_type = upload_type;
-	}
-
-	public String getVersion() {
-		return version;
-	}
-
-	public void setVersion(String version) {
-		this.version = version;
-	}
-
-	public String getAccess_right() {
-		return access_right;
-	}
-
-	public void setAccess_right(String access_right) {
-		this.access_right = access_right;
-	}
-
-	public List<Community> getCommunities() {
-		return communities;
-	}
-
-	public void setCommunities(List<Community> communities) {
-		this.communities = communities;
-	}
-
-	public List<Creator> getCreators() {
-		return creators;
-	}
-
-	public void setCreators(List<Creator> creators) {
-		this.creators = creators;
-	}
-
-	public String getDescription() {
-		return description;
-	}
-
-	public void setDescription(String description) {
-		this.description = description;
-	}
-
-	public String getDoi() {
-		return doi;
-	}
-
-	public void setDoi(String doi) {
-		this.doi = doi;
-	}
-
-	public List<Grant> getGrants() {
-		return grants;
-	}
-
-	public void setGrants(List<Grant> grants) {
-		this.grants = grants;
-	}
-
-	public List<String> getKeywords() {
-		return keywords;
-	}
-
-	public void setKeywords(List<String> keywords) {
-		this.keywords = keywords;
-	}
-
-	public String getLanguage() {
-		return language;
-	}
-
-	public void setLanguage(String language) {
-		this.language = language;
-	}
-
-	public String getLicense() {
-		return license;
-	}
-
-	public void setLicense(String license) {
-		this.license = license;
-	}
-
-	public PrereserveDoi getPrereserve_doi() {
-		return prereserve_doi;
-	}
-
-	public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
-		this.prereserve_doi = prereserve_doi;
-	}
-
-	public String getPublication_date() {
-		return publication_date;
-	}
-
-	public void setPublication_date(String publication_date) {
-		this.publication_date = publication_date;
-	}
-
-	public List<String> getReferences() {
-		return references;
-	}
-
-	public void setReferences(List<String> references) {
-		this.references = references;
-	}
-
-	public List<RelatedIdentifier> getRelated_identifiers() {
-		return related_identifiers;
-	}
-
-	public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
-		this.related_identifiers = related_identifiers;
-	}
-
-	public String getTitle() {
-		return title;
-	}
-
-	public void setTitle(String title) {
-		this.title = title;
-	}
-}
```
```diff
@@ -1,25 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-
-public class PrereserveDoi implements Serializable {
-	private String doi;
-	private String recid;
-
-	public String getDoi() {
-		return doi;
-	}
-
-	public void setDoi(String doi) {
-		this.doi = doi;
-	}
-
-	public String getRecid() {
-		return recid;
-	}
-
-	public void setRecid(String recid) {
-		this.recid = recid;
-	}
-}
```
```diff
@@ -1,43 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-
-public class RelatedIdentifier implements Serializable {
-	private String identifier;
-	private String relation;
-	private String resource_type;
-	private String scheme;
-
-	public String getIdentifier() {
-		return identifier;
-	}
-
-	public void setIdentifier(String identifier) {
-		this.identifier = identifier;
-	}
-
-	public String getRelation() {
-		return relation;
-	}
-
-	public void setRelation(String relation) {
-		this.relation = relation;
-	}
-
-	public String getResource_type() {
-		return resource_type;
-	}
-
-	public void setResource_type(String resource_type) {
-		this.resource_type = resource_type;
-	}
-
-	public String getScheme() {
-		return scheme;
-	}
-
-	public void setScheme(String scheme) {
-		this.scheme = scheme;
-	}
-}
```
```diff
@@ -1,118 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.io.Serializable;
-import java.util.List;
-
-public class ZenodoModel implements Serializable {
-
-	private String conceptrecid;
-	private String created;
-
-	private List<File> files;
-	private String id;
-	private Links links;
-	private Metadata metadata;
-	private String modified;
-	private String owner;
-	private String record_id;
-	private String state;
-	private boolean submitted;
-	private String title;
-
-	public String getConceptrecid() {
-		return conceptrecid;
-	}
-
-	public void setConceptrecid(String conceptrecid) {
-		this.conceptrecid = conceptrecid;
-	}
-
-	public String getCreated() {
-		return created;
-	}
-
-	public void setCreated(String created) {
-		this.created = created;
-	}
-
-	public List<File> getFiles() {
-		return files;
-	}
-
-	public void setFiles(List<File> files) {
-		this.files = files;
-	}
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(String id) {
-		this.id = id;
-	}
-
-	public Links getLinks() {
-		return links;
-	}
-
-	public void setLinks(Links links) {
-		this.links = links;
-	}
-
-	public Metadata getMetadata() {
-		return metadata;
-	}
-
-	public void setMetadata(Metadata metadata) {
-		this.metadata = metadata;
-	}
-
-	public String getModified() {
-		return modified;
-	}
-
-	public void setModified(String modified) {
-		this.modified = modified;
-	}
-
-	public String getOwner() {
-		return owner;
-	}
-
-	public void setOwner(String owner) {
-		this.owner = owner;
-	}
-
-	public String getRecord_id() {
-		return record_id;
-	}
-
-	public void setRecord_id(String record_id) {
-		this.record_id = record_id;
-	}
-
-	public String getState() {
-		return state;
-	}
-
-	public void setState(String state) {
-		this.state = state;
-	}
-
-	public boolean isSubmitted() {
-		return submitted;
-	}
-
-	public void setSubmitted(boolean submitted) {
-		this.submitted = submitted;
-	}
-
-	public String getTitle() {
-		return title;
-	}
-
-	public void setTitle(String title) {
-		this.title = title;
-	}
-}
```
```diff
@@ -1,7 +0,0 @@
-
-package eu.dnetlib.dhp.common.api.zenodo;
-
-import java.util.ArrayList;
-
-public class ZenodoModelList extends ArrayList<ZenodoModel> {
-}
```
```diff
@@ -65,13 +65,7 @@ public class RunSQLSparkJob {
 			for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
 				log.info("executing: {}", statement);
 				long startTime = System.currentTimeMillis();
-				try {
-					spark.sql(statement).show();
-				} catch (Exception e) {
-					log.error("Error executing statement: {}", statement, e);
-					System.err.println("Error executing statement: " + statement + "\n" + e);
-					throw e;
-				}
+				spark.sql(statement).show();
 				log
 					.info(
 						"executed in {}",
```
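For context on the simplified loop: the script is split into statements on a `; /* EOS */` end-of-statement marker. A small demo of that regex with made-up statements:

```java
public class EosSplitDemo {
	public static void main(String[] args) {
		String sql = "CREATE TABLE t (id INT); /* EOS */ INSERT INTO t VALUES (1); /* EOS */";
		// Same pattern as RunSQLSparkJob: a ';' followed by an /* EOS */ comment.
		for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
			System.out.println("executing: " + statement);
		}
		// prints:
		// executing: CREATE TABLE t (id INT)
		// executing: INSERT INTO t VALUES (1)
	}
}
```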
```diff
@@ -119,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 						.getContext()
 						.stream()
 						.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
-						.collect(Collectors.toList()));
+						.collect(Collectors.toCollection(ArrayList::new)));
 			}
 			return (T) res;
 		} else {
```
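`Collectors.toList()` makes no promise about the mutability of the list it returns, while `toCollection(ArrayList::new)` fixes the concrete type, so later in-place edits of the cleaned context list cannot hit an `UnsupportedOperationException`. A minimal illustration with made-up data:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class MutableCollectDemo {
	public static void main(String[] args) {
		// The JDK spec allows this list to be unmodifiable.
		List<String> unspecified = Stream.of("a", "b").collect(Collectors.toList());

		// This one is an ArrayList by construction, so mutation is always safe.
		List<String> mutable = Stream.of("a", "b").collect(Collectors.toCollection(ArrayList::new));
		mutable.add("c");
		System.out.println(mutable); // [a, b, c]
	}
}
```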
```diff
@@ -1015,4 +1015,41 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 			.orElse(null);
 	}
 
+	/**
+	 * Implements bad and ugly things that we should get rid of ASAP.
+	 *
+	 * @param value
+	 * @return
+	 * @param <T>
+	 */
+	public static <T extends Oaf> T dedicatedUglyHacks(T value) {
+		if (value instanceof OafEntity) {
+			if (value instanceof Result) {
+				final Result r = (Result) value;
+
+				// Fix for AMS Acta
+				Optional
+					.ofNullable(r.getInstance())
+					.map(
+						instance -> instance
+							.stream()
+							.filter(
+								i -> Optional
+									.ofNullable(i.getHostedby())
+									.map(KeyValue::getKey)
+									.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
+									.orElse(false)))
+					.ifPresent(instance -> instance.forEach(i -> {
+						if (Optional
+							.ofNullable(i.getPid())
+							.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
+							.orElse(false)) {
+							i.setHostedby(UNKNOWN_REPOSITORY);
+						}
+					}));
+			}
+		}
+		return value;
+	}
+
 }
```
```diff
@@ -433,7 +433,10 @@ public class MergeUtils {
 
 		// merge datainfo for same context id
 		merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
-			r.getDataInfo().addAll(l.getDataInfo());
+			ArrayList<DataInfo> di = new ArrayList<>();
+			di.addAll(r.getDataInfo());
+			di.addAll(l.getDataInfo());
+			r.setDataInfo(di);
 			return r;
 		}));
 
```
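Same theme as the GraphCleaningFunctions hunk above: `r.getDataInfo()` may hand back an unmodifiable list, so the merge now accumulates both sides into a fresh `ArrayList` and sets it back. An equivalent, slightly tighter form of the same fix (a sketch against the identifiers in this hunk, not a drop-in patch):

```java
// Defensive copy before merging; never mutate a list you did not create.
ArrayList<DataInfo> di = new ArrayList<>(r.getDataInfo());
di.addAll(l.getDataInfo());
r.setDataInfo(di);
```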
```diff
@@ -1,109 +0,0 @@
-
-package eu.dnetlib.dhp.common.api;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-@Disabled
-class ZenodoAPIClientTest {
-
-	private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
-	private final String ACCESS_TOKEN = "";
-
-	private final String CONCEPT_REC_ID = "657113";
-
-	private final String depositionId = "674915";
-
-	@Test
-	void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
-		ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
-			ACCESS_TOKEN);
-		Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
-
-		File file = new File(getClass()
-			.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
-			.getPath());
-
-		InputStream is = new FileInputStream(file);
-
-		Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
-
-		String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
-
-		Assertions.assertEquals(200, client.sendMretadata(metadata));
-
-		Assertions.assertEquals(202, client.publish());
-
-	}
-
-	@Test
-	void testNewDeposition() throws IOException {
-
-		ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
-			ACCESS_TOKEN);
-		Assertions.assertEquals(201, client.newDeposition());
-
-		File file = new File(getClass()
-			.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
-			.getPath());
-
-		InputStream is = new FileInputStream(file);
-
-		Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
-
-		String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
-
-		Assertions.assertEquals(200, client.sendMretadata(metadata));
-
-		Assertions.assertEquals(202, client.publish());
-
-	}
-
-	@Test
-	void testNewVersionNewName() throws IOException, MissingConceptDoiException {
-
-		ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
-			ACCESS_TOKEN);
-
-		Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
-
-		File file = new File(getClass()
-			.getResource("/eu/dnetlib/dhp/common/api/newVersion")
-			.getPath());
-
-		InputStream is = new FileInputStream(file);
-
-		Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
-
-		Assertions.assertEquals(202, client.publish());
-
-	}
-
-	@Test
-	void testNewVersionOldName() throws IOException, MissingConceptDoiException {
-
-		ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
-			ACCESS_TOKEN);
-
-		Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
-
-		File file = new File(getClass()
-			.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
-			.getPath());
-
-		InputStream is = new FileInputStream(file);
-
-		Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
-
-		Assertions.assertEquals(202, client.publish());
-
-	}
-
-}
```
```diff
@@ -177,7 +177,7 @@ class OafMapperUtilsTest {
 		assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
 
 		assertEquals(
-			ModelConstants.DATASET_RESULTTYPE_CLASSID,
+			ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
 			((Result) MergeUtils
 				.merge(p2, d1))
 				.getResulttype()
```
```diff
@@ -6,18 +6,7 @@
 		<artifactId>dhp-workflows</artifactId>
 		<version>1.2.5-SNAPSHOT</version>
 	</parent>
 
 	<artifactId>dhp-aggregation</artifactId>
 
-	<properties>
-		<affro.release.version>1.0.0</affro.release.version>
-	</properties>
-
-	<scm>
-		<url>https://code-repo.d4science.org/mkallipo/affRo</url>
-		<connection>scm:git:https://code-repo.d4science.org/mkallipo/affRo.git</connection>
-	</scm>
-
 	<build>
 		<plugins>
 			<plugin>
```
```diff
@@ -54,32 +43,6 @@
 					<scalaVersion>${scala.version}</scalaVersion>
 				</configuration>
 			</plugin>
-
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-scm-plugin</artifactId>
-				<version>1.8.1</version>
-				<configuration>
-					<connectionType>connection</connectionType>
-					<!--
-					<scmVersionType>tag</scmVersionType>--><!-- 'branch' can also be provided here -->
-					<!-- <scmVersion>${affro.release.version}</scmVersion>--><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
-
-					<scmVersionType>branch</scmVersionType><!-- 'branch' can also be provided here -->
-					<scmVersion>openaire-workflow-ready</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
-					<checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/affRo</checkoutDirectory>
-				</configuration>
-				<executions>
-					<execution>
-						<id>checkout-affro</id>
-						<phase>prepare-package</phase>
-						<goals>
-							<goal>checkout</goal>
-						</goals>
-					</execution>
-				</executions>
-			</plugin>
-
 		</plugins>
 
 	</build>
```
```diff
@@ -46,9 +46,6 @@ public class GetOpenCitationsRefs implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath {}", outputPath);
 
-		final String backupPath = parser.get("backupPath");
-		log.info("backupPath {}", backupPath);
-
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
 
@@ -56,11 +53,11 @@ public class GetOpenCitationsRefs implements Serializable {
 
 		GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
 
-		ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
+		ocr.doExtract(inputPath, outputPath, fileSystem);
 
 	}
 
-	private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
+	private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
 		throws IOException {
 
 		RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
@@ -92,7 +89,6 @@ public class GetOpenCitationsRefs implements Serializable {
 				}
 
 			}
-			fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
 		}
 
 	}
```
@@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable {
         final String workingPath = parser.get("inputPath");
         log.info("workingPath {}", workingPath);
 
+        final String backupPath = parser.get("backupPath");
+        log.info("backupPath {}", backupPath);
+
         SparkConf sconf = new SparkConf();
 
         Configuration conf = new Configuration();
@@ -68,12 +71,14 @@ public class ReadCOCI implements Serializable {
                     workingPath,
                     fileSystem,
                     outputPath,
+                    backupPath,
                     delimiter);
             });
     }
 
     private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
         String outputPath,
+        String backupPath,
         String delimiter) throws IOException {
         RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
             .listFiles(
@@ -108,7 +113,7 @@ public class ReadCOCI implements Serializable {
                 .option("compression", "gzip")
                 .json(outputPath);
 
-            fileSystem.delete(fileStatus.getPath());
+            fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
         }
 
     }

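Taken together with the GetOpenCitationsRefs hunks above, these changes move the backup step downstream: the extracted OpenCitations files are no longer deleted once ReadCOCI has written them out as JSON, they are renamed into a backup folder instead, so a failed run can be replayed. A minimal sketch of that pattern against the Hadoop FileSystem API (the class and path handling below are illustrative, not the project's own):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class BackupInsteadOfDelete {

        // Move a processed file into a backup folder instead of deleting it.
        public static void archive(FileSystem fs, Path processed, Path backupDir) throws IOException {
            if (!fs.exists(backupDir)) {
                fs.mkdirs(backupDir); // make sure the backup folder exists first
            }
            // rename() is a metadata-only move on HDFS, so this is cheap
            fs.rename(processed, new Path(backupDir, processed.getName()));
        }
    }
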
@@ -297,7 +297,7 @@ public class ExtractPerson implements Serializable {
     }
 
     private static Relation getAffiliationRelation(Employment row) {
-        String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid());
+        String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
         String target = ROR_PREFIX
             + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
         List<KeyValue> properties = new ArrayList<>();

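The fix drops a duplicated separator, presumably because PERSON_PREFIX already ends with "::", so the old concatenation doubled it in the source identifier. A quick way to sanity-check this kind of id construction, using commons-codec's DigestUtils as a stand-in for IdentifierFactory.md5 (the prefix value below is hypothetical; the real constant is defined in ExtractPerson):

    import org.apache.commons.codec.digest.DigestUtils;

    public class PersonIdCheck {

        // hypothetical prefix, assumed to already carry the "::" separator
        private static final String PERSON_PREFIX = "30|orcid_______::";

        public static void main(String[] args) {
            String orcid = "0000-0002-1825-0097";
            // hex-encoded MD5 as an id-hashing stand-in
            String source = PERSON_PREFIX + DigestUtils.md5Hex(orcid);
            System.out.println(source); // exactly one "::", supplied by the prefix
        }
    }
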
@@ -1,45 +0,0 @@
-# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties ---
-# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
-# dhp.hadoop.frontend.user.name=ilias.kanellos
-# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
-# dhp.hadoop.frontend.port.ssh=22
-# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
-# jobTracker=yarnRM
-# nameNode=hdfs://nameservice1
-# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
-# maven.executable=mvn
-
-
-# The above is given differently in an example I found online
-oozie.action.sharelib.for.spark=spark2
-oozieActionShareLibForSpark2=spark2
-spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
-spark2EventLogDir=/user/spark/spark2ApplicationHistory
-sparkSqlWarehouseDir=/user/hive/warehouse
-#hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
-# This MAY avoid the no library used error
-oozie.use.system.libpath=true
-# Some stuff copied from openaire's jobs
-spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
-spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
-
-# The following is needed as a property of a workflow
-wfAppPath=${oozieTopWfApplicationPath}
-
-resumeFrom=Crossref
-
-#OpenAlex input/output
-#resultFolder=/tmp/affro-results/oalex
-#inputFolder=/user/zeppelin/affiliations/raw_aff_string/2024-08
-
-#Crossref input/output
-resultFolder=/tmp/affro-results/crossref
-inputFolder=/data/doiboost/crossref/crossref_unpack
-
-#
-#crossrefInputPath=/data/bip-affiliations/crossref-data.json
-#pubmedInputPath=/data/bip-affiliations/pubmed-data.json
-#openapcInputPath=/data/bip-affiliations/openapc-data.json
-#dataciteInputPath=/data/bip-affiliations/datacite-data.json
-#
-#outputPath=/tmp/crossref-affiliations-output-v5

@@ -1,30 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>hiveMetastoreUris</name>
-        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-    </property>
-    <property>
-        <name>hiveJdbcUrl</name>
-        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
-    </property>
-    <property>
-        <name>hiveDbName</name>
-        <value>openaire</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-</configuration>

@@ -1,176 +0,0 @@
-<workflow-app name="AffroAffiliations" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-        <property>
-            <name>oozieActionShareLibForSpark2</name>
-            <description>oozie action sharelib for spark 2.*</description>
-        </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
-        <property>
-            <name>spark2YarnHistoryServerAddress</name>
-            <description>spark 2.* yarn history server address</description>
-        </property>
-        <property>
-            <name>spark2EventLogDir</name>
-            <description>spark 2.* event log dir location</description>
-        </property>
-    </parameters>
-
-    <global>
-        <job-tracker>${jobTracker}</job-tracker>
-        <name-node>${nameNode}</name-node>
-        <configuration>
-            <property>
-                <name>mapreduce.job.queuename</name>
-                <value>${queueName}</value>
-            </property>
-            <property>
-                <name>oozie.launcher.mapred.job.queue.name</name>
-                <value>${oozieLauncherQueueName}</value>
-            </property>
-            <property>
-                <name>oozie.action.sharelib.for.spark</name>
-                <value>${oozieActionShareLibForSpark2}</value>
-            </property>
-
-        </configuration>
-    </global>
-
-    <start to="resumeFrom"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-    <decision name="resumeFrom">
-        <switch>
-            <case to="run-affro-on-iisdata">${wf:conf('resumeFrom') eq 'IIS'}</case>
-            <case to="run-affro-on-crossref">${wf:conf('resumeFrom') eq 'Crossref'}</case>
-            <default to="run-affro-on-oalexstrings"/>
-        </switch>
-    </decision>
-    <action name="run-affro-on-iisdata">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Affiliations inference (Affro)</name>
-            <jar>update_records.py</jar>
-
-            <spark-opts>
-                --executor-cores=4
-                --executor-memory=6G
-                --driver-memory=15G
-                --conf spark.executor.memoryOverhead=6G
-                --conf spark.sql.shuffle.partitions=20000
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
-                --conf spark.executorEnv.PYSPARK_PYTHON=python3
-                --py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/affro_test_example.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
-                --files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
-            </spark-opts>
-
-            <arg>${resultFolder}</arg>
-
-            <file>${wfAppPath}/affRo/update_records.py#update_records.py</file>
-        </spark>
-
-        <ok to="End" />
-        <error to="Kill" />
-
-    </action>
-
-    <action name="run-affro-on-oalexstrings">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Affiliations inference (Affro)</name>
-            <jar>strings.py</jar>
-
-            <spark-opts>
-                --executor-cores=4
-                --executor-memory=6G
-                --driver-memory=15G
-                --conf spark.executor.memoryOverhead=6G
-                --conf spark.sql.shuffle.partitions=20000
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
-                --conf spark.executorEnv.PYSPARK_PYTHON=python3
-                --py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
-                --files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
-            </spark-opts>
-
-            <arg>${inputFolder}</arg>
-            <arg>${resultFolder}</arg>
-
-            <file>${wfAppPath}/affRo/strings.py#strings.py</file>
-        </spark>
-
-        <ok to="End" />
-        <error to="Kill" />
-
-    </action>
-
-    <action name="run-affro-on-crossref">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Affiliations inference (Affro)</name>
-            <jar>crossref.py</jar>
-
-            <spark-opts>
-                --executor-cores=4
-                --executor-memory=6G
-                --driver-memory=15G
-                --conf spark.executor.memoryOverhead=6G
-                --conf spark.sql.shuffle.partitions=20000
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
-                --conf spark.executorEnv.PYSPARK_PYTHON=python3
-                --py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
-                --files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
-            </spark-opts>
-
-            <arg>${inputFolder}</arg>
-            <arg>${resultFolder}</arg>
-
-            <file>${wfAppPath}/affRo/crossref.py#crossref.py</file>
-        </spark>
-
-        <ok to="End" />
-        <error to="Kill" />
-
-    </action>
-
-    <end name="End"/>
-</workflow-app>

@@ -35,5 +35,6 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json
 pubmedInputPath=/data/bip-affiliations/pubmed-data.json
 openapcInputPath=/data/bip-affiliations/openapc-data.json
 dataciteInputPath=/data/bip-affiliations/datacite-data.json
+webCrawlInputPath=/data/bip-affiliations/webCrawl/
 
 outputPath=/tmp/crossref-affiliations-output-v5

@@ -21,6 +21,10 @@
             <name>webCrawlInputPath</name>
             <description>the path where to find the inferred affiliation relations from webCrawl</description>
         </property>
+        <property>
+            <name>publisherInputPath</name>
+            <description>the path where to find the inferred affiliation relations from publisher websites</description>
+        </property>
         <property>
             <name>outputPath</name>
             <description>the path where to store the actionset</description>
@@ -117,6 +121,7 @@
             <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
             <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
             <arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
+            <arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
         </spark>
         <ok to="End"/>

@@ -16,11 +16,5 @@
     "paramLongName": "hdfsNameNode",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
-  },
-  {
-    "paramName": "bp",
-    "paramLongName": "backupPath",
-    "paramDescription": "the hdfs path to move the OC data after the extraction",
-    "paramRequired": true
   }
 ]

@@ -30,6 +30,12 @@
     "paramLongName": "hdfsNameNode",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
+  },
+  {
+    "paramName": "bp",
+    "paramLongName": "backupPath",
+    "paramDescription": "the hdfs path to move the OC data after the extraction",
+    "paramRequired": true
   }
 ]
 
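These JSON descriptors are what the dhp ArgumentApplicationParser is built from: each entry declares a short option (paramName), a long option (paramLongName), and whether it is mandatory, and the job reads values back by long name. A sketch of how the new backupPath option would be consumed, assuming the parser is constructed from the descriptor above (the resource path is illustrative):

    import org.apache.commons.io.IOUtils;

    import eu.dnetlib.dhp.application.ArgumentApplicationParser;

    public class ParseBackupPath {

        public static void main(String[] args) throws Exception {
            // build the parser from the JSON descriptor (illustrative resource path)
            final ArgumentApplicationParser parser = new ArgumentApplicationParser(
                IOUtils
                    .toString(
                        ParseBackupPath.class
                            .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json")));

            parser.parseArgument(args); // e.g. --backupPath /data/opencitations/backup

            final String backupPath = parser.get("backupPath"); // lookup is by paramLongName
            System.out.println("backupPath: " + backupPath);
        }
    }
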
@@ -94,7 +94,17 @@
                 <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
                 <arg>--inputPath</arg><arg>${inputPath}/Original</arg>
                 <arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
-                <arg>--backupPath</arg><arg>${inputPath}/backup</arg>
+            </java>
+            <ok to="read"/>
+            <error to="Kill"/>
+        </action>
+
+        <action name="extract_correspondence">
+            <java>
+                <main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
+                <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+                <arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
+                <arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
             </java>
             <ok to="read"/>
             <error to="Kill"/>
@@ -119,6 +129,7 @@
             </spark-opts>
             <arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
             <arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
+            <arg>--backupPath</arg><arg>${inputPath}/backup</arg>
             <arg>--delimiter</arg><arg>${delimiter}</arg>
             <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
         </spark>

@@ -16,10 +16,11 @@
     "paramLongName": "isSparkSessionManged",
     "paramDescription": "the hdfs name node",
     "paramRequired": false
-  },{
-    "paramName": "nn",
-    "paramLongName": "nameNode",
-    "paramDescription": "the hdfs name node",
-    "paramRequired": true
-  }
+  },
+  {
+    "paramName": "nn",
+    "paramLongName": "nameNode",
+    "paramDescription": "the hdfs name node",
+    "paramRequired": true
+  }
 ]

@@ -24,7 +24,7 @@
 
     <decision name="resume_from">
         <switch>
-            <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
+            <case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
             <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
         </switch>
     </decision>
@@ -33,6 +33,14 @@
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
 
+    <action name="reset_workingDir">
+        <fs>
+            <delete path="${workingDir}"/>
+            <mkdir path="${workingDir}"/>
+        </fs>
+        <ok to="download"/>
+        <error to="Kill"/>
+    </action>
     <action name="download">
         <shell xmlns="uri:oozie:shell-action:0.2">
             <job-tracker>${jobTracker}</job-tracker>

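The new reset_workingDir action relies on Oozie's built-in fs action to guarantee an empty working directory before the download starts. The same reset, done programmatically against HDFS, looks roughly like this (a sketch; the defaultFS and directory values are illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ResetWorkingDir {

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "hdfs://nameservice1"); // illustrative name node

            try (FileSystem fs = FileSystem.get(conf)) {
                Path workingDir = new Path("/tmp/my-wf/working_dir"); // illustrative
                fs.delete(workingDir, true); // recursive, like <delete path="${workingDir}"/>
                fs.mkdirs(workingDir);       // recreate empty, like <mkdir path="${workingDir}"/>
            }
        }
    }
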
@@ -1,4 +1,4 @@
 <workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>sourcePath</name>
@@ -8,19 +8,40 @@
             <name>database</name>
             <description>the PDB Database Working Path</description>
         </property>
 
         <property>
-            <name>targetPath</name>
-            <description>the Target Working dir path</description>
+            <name>mdStoreOutputId</name>
+            <description>the identifier of the cleaned MDStore</description>
+        </property>
+        <property>
+            <name>mdStoreManagerURI</name>
+            <description>the path of the cleaned mdstore</description>
         </property>
     </parameters>
 
-    <start to="ConvertDB"/>
+    <start to="StartTransaction"/>
 
 
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
 
+    <action name="StartTransaction">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>NEW_VERSION</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
+        <ok to="ConvertDB"/>
+        <error to="RollBack"/>
+    </action>
     <action name="ConvertDB">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
@@ -41,11 +62,48 @@
             <arg>--master</arg><arg>yarn</arg>
             <arg>--dbPath</arg><arg>${sourcePath}</arg>
             <arg>--database</arg><arg>${database}</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
         </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
+        <ok to="CommitVersion"/>
+        <error to="RollBack"/>
 
     </action>
-    <end name="End"/>
+    <action name="CommitVersion">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>COMMIT</arg>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="RollBack">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>ROLLBACK</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="Kill"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <end name="End"/>
+
 </workflow-app>

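The ConvertDB step is now wrapped in an MDStore transaction: StartTransaction asks the MDStore manager for a NEW_VERSION (the resulting version is exposed to later actions through capture-output), the Spark job writes into that version's HDFS path, and the version is then either committed or rolled back. Stripped of the Oozie plumbing, the control flow is roughly the sketch below; MdStoreClient and its methods are hypothetical stand-ins for the NEW_VERSION/COMMIT/ROLLBACK actions handled by eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode, not the project's API:

    public class MdStoreTransactionSketch {

        // hypothetical wrapper around the MDStore manager operations
        interface MdStoreClient {
            String newVersion(String mdStoreId); // NEW_VERSION: open a version, return its id
            void commit(String versionId);       // COMMIT: publish the version
            void rollback(String versionId);     // ROLLBACK: discard it
        }

        static void run(MdStoreClient client, String mdStoreId, Runnable sparkJob) {
            String version = client.newVersion(mdStoreId); // StartTransaction
            try {
                sparkJob.run();          // ConvertDB writes under the version's HDFS path
                client.commit(version);  // CommitVersion, then End
            } catch (RuntimeException e) {
                client.rollback(version); // RollBack, then the workflow is killed
                throw e;
            }
        }
    }
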
@@ -2,5 +2,5 @@
 {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
 {"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
 {"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
+{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
 ]

@@ -1,5 +1,20 @@
 [
-{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
-{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the source Path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mo",
+    "paramLongName": "mdstoreOutputVersion",
+    "paramDescription": "the oaf path ",
+    "paramRequired": true
+  }
 ]

@@ -9,34 +9,26 @@
         <description>the Working Path</description>
     </property>
     <property>
-        <name>targetPath</name>
-        <description>the OAF MDStore Path</description>
+        <name>mdStoreOutputId</name>
+        <description>the identifier of the cleaned MDStore</description>
     </property>
     <property>
-        <name>sparkDriverMemory</name>
-        <description>memory for driver process</description>
-    </property>
-    <property>
-        <name>sparkExecutorMemory</name>
-        <description>memory for individual executor</description>
-    </property>
-    <property>
-        <name>sparkExecutorCores</name>
-        <description>number of cores used by single executor</description>
+        <name>mdStoreManagerURI</name>
+        <description>the path of the cleaned mdstore</description>
     </property>
     <property>
         <name>resumeFrom</name>
-        <value>DownloadEBILinks</value>
+        <value>CreateEBIDataSet</value>
         <description>node to start</description>
     </property>
 </parameters>
 
-<start to="resume_from"/>
+<start to="StartTransaction"/>
 
 <decision name="resume_from">
     <switch>
         <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
-        <case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
+        <case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
         <default to="DownloadEBILinks"/>
     </switch>
 </decision>
@@ -77,9 +69,29 @@
         <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
         <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
     </fs>
-    <ok to="CreateEBIDataSet"/>
+    <ok to="StartTransaction"/>
     <error to="Kill"/>
 </action>
 
+<action name="StartTransaction">
+    <java>
+        <configuration>
+            <property>
+                <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                <value>true</value>
+            </property>
+        </configuration>
+        <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+        <arg>--action</arg><arg>NEW_VERSION</arg>
+        <arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+        <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        <capture-output/>
+    </java>
+    <ok to="CreateEBIDataSet"/>
+    <error to="RollBack"/>
+</action>
+
+
 <action name="CreateEBIDataSet">
     <spark xmlns="uri:oozie:spark-action:0.2">
         <master>yarn-cluster</master>
@@ -95,11 +107,49 @@
             ${sparkExtraOPT}
         </spark-opts>
         <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
-        <arg>--targetPath</arg><arg>${targetPath}</arg>
+        <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
        <arg>--master</arg><arg>yarn</arg>
     </spark>
     <ok to="End"/>
     <error to="Kill"/>
 </action>
 
+
+<action name="CommitVersion">
+    <java>
+        <configuration>
+            <property>
+                <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                <value>true</value>
+            </property>
+        </configuration>
+        <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+        <arg>--action</arg><arg>COMMIT</arg>
+        <arg>--namenode</arg><arg>${nameNode}</arg>
+        <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+        <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+    </java>
+    <ok to="End"/>
+    <error to="Kill"/>
+</action>
+
+<action name="RollBack">
+    <java>
+        <configuration>
+            <property>
+                <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                <value>true</value>
+            </property>
+        </configuration>
+        <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+        <arg>--action</arg><arg>ROLLBACK</arg>
+        <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+        <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+    </java>
+    <ok to="Kill"/>
+    <error to="Kill"/>
+</action>
+
 <end name="End"/>
 
 </workflow-app>

@@ -407,9 +407,10 @@ object DataciteToOAFTransformation {
           )
       }
       if (c.affiliation.isDefined)
-        a.setRawAffiliationString(
+        a.setAffiliation(
           c.affiliation.get
             .filter(af => af.nonEmpty)
+            .map(af => OafMapperUtils.field(af, dataInfo))
             .asJava
         )
       a.setRank(idx + 1)

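Instead of setting raw affiliation strings, the mapping now wraps each affiliation into a Field carrying the record-level provenance (dataInfo) before attaching it to the author. The wrapping step in isolation, as a sketch that assumes the dhp-schemas packaging of the OafMapperUtils.field helper the diff itself uses:

    import java.util.List;
    import java.util.stream.Collectors;

    import eu.dnetlib.dhp.schema.oaf.DataInfo;
    import eu.dnetlib.dhp.schema.oaf.Field;
    import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

    public class WrapAffiliations {

        // wrap each non-empty affiliation string into a Field<String> with provenance
        static List<Field<String>> wrap(List<String> affiliations, DataInfo dataInfo) {
            return affiliations
                .stream()
                .filter(af -> !af.isEmpty())
                .map(af -> OafMapperUtils.field(af, dataInfo))
                .collect(Collectors.toList());
        }
    }
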
@@ -231,7 +231,7 @@ object BioDBToOAF {
   def uniprotToOAF(input: String): List[Oaf] = {
     implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
     lazy val json = parse(input)
-    val pid = (json \ "pid").extract[String]
+    val pid = (json \ "pid").extract[String].trim()
 
     val d = new Dataset
 
@@ -2,12 +2,15 @@ package eu.dnetlib.dhp.sx.bio
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
+import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 
 object SparkTransformBioDatabaseToOAF {
 
@@ -25,8 +28,13 @@ object SparkTransformBioDatabaseToOAF {
 
     val dbPath: String = parser.get("dbPath")
     log.info("dbPath: {}", database)
-    val targetPath: String = parser.get("targetPath")
-    log.info("targetPath: {}", database)
+    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+
+    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+    log.info("outputBasePath: {}", outputBasePath)
 
     val spark: SparkSession =
       SparkSession
@@ -43,24 +51,28 @@ object SparkTransformBioDatabaseToOAF {
       case "UNIPROT" =>
         CollectionUtils.saveDataset(
           spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
       case "PDB" =>
         CollectionUtils.saveDataset(
           spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
       case "SCHOLIX" =>
         CollectionUtils.saveDataset(
          spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
        )
       case "CROSSREF_LINKS" =>
         CollectionUtils.saveDataset(
           spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
     }
 
+    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
+    val mdStoreSize = df.count
+    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
   }
 
 }

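The Spark job no longer receives a plain targetPath: it is handed the MDStoreVersion JSON captured by StartTransaction, deserializes it, derives the output location from the version's HDFS path, and finally records the record count next to the data. The deserialization step on its own, assuming Jackson and a minimal bean in place of the real eu.dnetlib.dhp.schema.mdstore.MDStoreVersion (the JSON payload below is illustrative):

    import com.fasterxml.jackson.databind.DeserializationFeature;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class ParseMdStoreVersion {

        // minimal stand-in for eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
        public static class MdStoreVersionBean {
            private String hdfsPath;

            public String getHdfsPath() { return hdfsPath; }
            public void setHdfsPath(String hdfsPath) { this.hdfsPath = hdfsPath; }
        }

        public static void main(String[] args) throws Exception {
            String json = "{\"hdfsPath\":\"/data/mdstore/md-1234/version-1\"}"; // illustrative

            ObjectMapper mapper = new ObjectMapper()
                // the real payload carries more fields than this minimal bean models
                .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

            MdStoreVersionBean version = mapper.readValue(json, MdStoreVersionBean.class);
            // the job then appends MDSTORE_DATA_PATH / MDSTORE_SIZE_PATH to this base
            System.out.println("outputBasePath: " + version.getHdfsPath());
        }
    }
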
@@ -9,6 +9,9 @@ import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.slf4j.{Logger, LoggerFactory}
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
+import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 
 object SparkEBILinksToOaf {
 
@@ -32,8 +35,13 @@ object SparkEBILinksToOaf {
   import spark.implicits._
   val sourcePath = parser.get("sourcePath")
   log.info(s"sourcePath -> $sourcePath")
-  val targetPath = parser.get("targetPath")
-  log.info(s"targetPath -> $targetPath")
+  val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+  log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+
+  val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+  val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+  log.info("outputBasePath: {}", outputBasePath)
 
   implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
 
   val ebLinks: Dataset[EBILinkItem] = spark.read
@@ -46,7 +54,10 @@ object SparkEBILinksToOaf {
       .flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
       .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
       .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
-    targetPath
+    s"$outputBasePath/$MDSTORE_DATA_PATH"
   )
+  val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
+  val mdStoreSize = df.count
+  writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
   }
 }

@@ -28,6 +28,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
 
@@ -39,7 +40,8 @@ public class PrepareAffiliationRelationsTest {
 
     private static Path workingDir;
     private static final String ID_PREFIX = "50|doi_________::";
-    private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
+    private static final Logger log = LoggerFactory
+        .getLogger(PrepareAffiliationRelationsTest.class);
 
     @BeforeAll
     public static void beforeAll() throws IOException {

@@ -77,13 +77,13 @@ public class RemapTest {
         MapOCIdsInPids
             .main(
                 new String[] {
-                    "-isSparkSessionManged",
+                    "--isSparkSessionManged",
                     Boolean.FALSE.toString(),
-                    "-inputPath",
+                    "--inputPath",
                     inputPath,
-                    "-outputPath",
+                    "--outputPath",
                     workingDir.toString() + "/out/",
-                    "-nameNode", "input1;input2;input3;input4;input5"
+                    "--nameNode", "hdfs://localhost"
                 });
 
     }

@ -1,15 +1,44 @@
|
||||||
{"pdb": "1CW0", "title": "crystal structure analysis of very short patch repair (vsr) endonuclease in complex with a duplex dna", "authors": ["S.E.Tsutakawa", "H.Jingami", "K.Morikawa"], "doi": "10.1016/S0092-8674(00)81550-0", "pmid": "10612397"}
|
{"classification": "Signaling protein", "pdb": "5NM4", "deposition_date": "2017-04-05", "title": "A2a adenosine receptor room-temperature structure determined by serial Femtosecond crystallography", "Keywords": ["Oom-temperature", " serial crystallography", " signaling protein"], "authors": ["T.weinert", "R.cheng", "D.james", "D.gashi", "P.nogly", "K.jaeger", "M.hennig", "", "J.standfuss"], "pmid": "28912485", "doi": "10.1038/S41467-017-00630-4"}
|
||||||
{"pdb": "2CWW", "title": "crystal structure of thermus thermophilus ttha1280, a putative sam- dependent rna methyltransferase, in complex with s-adenosyl-l- homocysteine", "authors": ["A.A.Pioszak", "K.Murayama", "N.Nakagawa", "A.Ebihara", "S.Kuramitsu", "M.Shirouzu", "S.Yokoyama", "Riken Structural Genomics/proteomics Initiative (Rsgi)"], "doi": "10.1107/S1744309105029842", "pmid": "16511182"}
|
{"classification": "Oxidoreductase/oxidoreductase inhibitor", "pdb": "4KN3", "deposition_date": "2013-05-08", "title": "Structure of the y34ns91g double mutant of dehaloperoxidase from Amphitrite ornata with 2,4,6-trichlorophenol", "Keywords": ["Lobin", " oxygen storage", " peroxidase", " oxidoreductase", " oxidoreductase-", "Oxidoreductase inhibitor complex"], "authors": ["C.wang", "L.lovelace", "L.lebioda"], "pmid": "23952341", "doi": "10.1021/BI400627W"}
|
||||||
{"pdb": "6CWE", "title": "structure of alpha-gsa[8,6p] bound by cd1d and in complex with the va14vb8.2 tcr", "authors": ["J.Wang", "D.Zajonc"], "doi": null, "pmid": null}
|
{"classification": "Transport protein", "pdb": "8HKM", "deposition_date": "2022-11-27", "title": "Ion channel", "Keywords": ["On channel", " transport protein"], "authors": ["D.h.jiang", "J.t.zhang"], "pmid": "37494189", "doi": "10.1016/J.CELREP.2023.112858"}
|
||||||
{"pdb": "5CWS", "title": "crystal structure of the intact chaetomium thermophilum nsp1-nup49- nup57 channel nucleoporin heterotrimer bound to its nic96 nuclear pore complex attachment site", "authors": ["C.J.Bley", "S.Petrovic", "M.Paduch", "V.Lu", "A.A.Kossiakoff", "A.Hoelz"], "doi": "10.1126/SCIENCE.AAC9176", "pmid": "26316600"}
|
{"classification": "Signaling protein", "pdb": "6JT1", "deposition_date": "2019-04-08", "title": "Structure of human soluble guanylate cyclase in the heme oxidised State", "Keywords": ["Oluble guanylate cyclase", " signaling protein"], "authors": ["L.chen", "Y.kang", "R.liu", "J.-x.wu"], "pmid": "31514202", "doi": "10.1038/S41586-019-1584-6"}
|
||||||
{"pdb": "5CWE", "title": "structure of cyp107l2 from streptomyces avermitilis with lauric acid", "authors": ["T.-V.Pham", "S.-H.Han", "J.-H.Kim", "D.-H.Kim", "L.-W.Kang"], "doi": null, "pmid": null}
|
{"classification": "Immune system", "pdb": "7OW6", "deposition_date": "2021-06-16", "title": "Crystal structure of a tcr in complex with hla-a*11:01 bound to kras G12d peptide (vvvgadgvgk)", "Keywords": ["La", " kras", " tcr", " immune system"], "authors": ["V.karuppiah", "R.a.robinson"], "doi": "10.1038/S41467-022-32811-1"}
|
||||||
{"pdb": "7CW4", "title": "acetyl-coa acetyltransferase from bacillus cereus atcc 14579", "authors": ["J.Hong", "K.J.Kim"], "doi": "10.1016/J.BBRC.2020.09.048", "pmid": "32972748"}
|
{"classification": "Biosynthetic protein", "pdb": "5EQ8", "deposition_date": "2015-11-12", "title": "Crystal structure of medicago truncatula histidinol-phosphate Phosphatase (mthpp) in complex with l-histidinol", "Keywords": ["Istidine biosynthesis", " metabolic pathways", " dimer", " plant", "", "Biosynthetic protein"], "authors": ["M.ruszkowski", "Z.dauter"], "pmid": "26994138", "doi": "10.1074/JBC.M115.708727"}
|
||||||
{"pdb": "2CWP", "title": "crystal structure of metrs related protein from pyrococcus horikoshii", "authors": ["K.Murayama", "M.Kato-Murayama", "M.Shirouzu", "S.Yokoyama", "Riken StructuralGenomics/proteomics Initiative (Rsgi)"], "doi": null, "pmid": null}
|
{"classification": "De novo protein", "pdb": "8CWA", "deposition_date": "2022-05-18", "title": "Solution nmr structure of 8-residue rosetta-designed cyclic peptide D8.21 in cdcl3 with cis/trans switching (tc conformation, 53%)", "Keywords": ["Yclic peptide", " non natural amino acids", " cis/trans", " switch peptides", "", "De novo design", "Membrane permeability", "De novo protein"], "authors": ["T.a.ramelot", "R.tejero", "G.t.montelione"], "pmid": "36041435", "doi": "10.1016/J.CELL.2022.07.019"}
|
||||||
{"pdb": "2CW7", "title": "crystal structure of intein homing endonuclease ii", "authors": ["H.Matsumura", "H.Takahashi", "T.Inoue", "H.Hashimoto", "M.Nishioka", "S.Fujiwara", "M.Takagi", "T.Imanaka", "Y.Kai"], "doi": "10.1002/PROT.20858", "pmid": "16493661"}
|
{"classification": "Hydrolase", "pdb": "3R6M", "deposition_date": "2011-03-21", "title": "Crystal structure of vibrio parahaemolyticus yeaz", "Keywords": ["Ctin/hsp70 nucleotide-binding fold", " bacterial resuscitation", " viable", "But non-culturable state", "Resuscitation promoting factor", "Ygjd", "", "Yjee", "Vibrio parahaemolyticus", "Hydrolase"], "authors": ["A.roujeinikova", "I.aydin"], "pmid": "21858042", "doi": "10.1371/JOURNAL.PONE.0023245"}
|
||||||
{"pdb": "1CWU", "title": "brassica napus enoyl acp reductase a138g mutant complexed with nad+ and thienodiazaborine", "authors": ["A.Roujeinikova", "J.B.Rafferty", "D.W.Rice"], "doi": "10.1074/JBC.274.43.30811", "pmid": "10521472"}
|
{"classification": "Hydrolase", "pdb": "2W5J", "deposition_date": "2008-12-10", "title": "Structure of the c14-rotor ring of the proton translocating Chloroplast atp synthase", "Keywords": ["Ydrolase", " chloroplast", " atp synthase", " lipid-binding", " cf(0)", " membrane", "", "Transport", "Formylation", "Energy transduction", "Hydrogen ion transport", "", "Ion transport", "Transmembrane", "Membrane protein"], "authors": ["M.vollmar", "D.schlieper", "M.winn", "C.buechner", "G.groth"], "pmid": "19423706", "doi": "10.1074/JBC.M109.006916"}
|
||||||
{"pdb": "3CWN", "title": "escherichia coli transaldolase b mutant f178y", "authors": ["T.Sandalova", "G.Schneider", "A.Samland"], "doi": "10.1074/JBC.M803184200", "pmid": "18687684"}
|
{"classification": "De novo protein", "pdb": "4GLU", "deposition_date": "2012-08-14", "title": "Crystal structure of the mirror image form of vegf-a", "Keywords": ["-protein", " covalent dimer", " cysteine knot protein", " growth factor", " de", "Novo protein"], "authors": ["K.mandal", "M.uppalapati", "D.ault-riche", "J.kenney", "J.lowitz", "S.sidhu", "", "S.b.h.kent"], "pmid": "22927390", "doi": "10.1073/PNAS.1210483109"}
|
||||||
{"pdb": "1CWL", "title": "human cyclophilin a complexed with 4 4-hydroxy-meleu cyclosporin", "authors": ["V.Mikol", "J.Kallen", "P.Taylor", "M.D.Walkinshaw"], "doi": "10.1006/JMBI.1998.2108", "pmid": "9769216"}
|
{"classification": "Hydrolase/hydrolase inhibitor", "pdb": "3WYL", "deposition_date": "2014-09-01", "title": "Crystal structure of the catalytic domain of pde10a complexed with 5- Methoxy-3-(1-phenyl-1h-pyrazol-5-yl)-1-(3-(trifluoromethyl)phenyl) Pyridazin-4(1h)-one", "Keywords": ["Ydrolase-hydrolase inhibitor complex"], "authors": ["H.oki", "Y.hayano"], "pmid": "25384088", "doi": "10.1021/JM5013648"}
|
||||||
{"pdb": "3CW2", "title": "crystal structure of the intact archaeal translation initiation factor 2 from sulfolobus solfataricus .", "authors": ["E.A.Stolboushkina", "S.V.Nikonov", "A.D.Nikulin", "U.Blaesi", "D.J.Manstein", "R.V.Fedorov", "M.B.Garber", "O.S.Nikonov"], "doi": "10.1016/J.JMB.2008.07.039", "pmid": "18675278"}
|
{"classification": "Isomerase", "pdb": "5BOR", "deposition_date": "2015-05-27", "title": "Structure of acetobacter aceti pure-s57c, sulfonate form", "Keywords": ["Cidophile", " pure", " purine biosynthesis", " isomerase"], "authors": ["K.l.sullivan", "T.j.kappock"]}
|
||||||
{"pdb": "3CW9", "title": "4-chlorobenzoyl-coa ligase/synthetase in the thioester-forming conformation, bound to 4-chlorophenacyl-coa", "authors": ["A.S.Reger", "J.Cao", "R.Wu", "D.Dunaway-Mariano", "A.M.Gulick"], "doi": "10.1021/BI800696Y", "pmid": "18620418"}
|
{"classification": "Hydrolase", "pdb": "1X0C", "deposition_date": "2005-03-17", "title": "Improved crystal structure of isopullulanase from aspergillus niger Atcc 9642", "Keywords": ["Ullulan", " glycoside hydrolase family 49", " glycoprotein", " hydrolase"], "authors": ["M.mizuno", "T.tonozuka", "A.yamamura", "Y.miyasaka", "H.akeboshi", "S.kamitori", "", "A.nishikawa", "Y.sakano"], "pmid": "18155243", "doi": "10.1016/J.JMB.2007.11.098"}
|
||||||
{"pdb": "3CWU", "title": "crystal structure of an alka host/guest complex 2'-fluoro-2'-deoxy-1, n6-ethenoadenine:thymine base pair", "authors": ["B.R.Bowman", "S.Lee", "S.Wang", "G.L.Verdine"], "doi": "10.1016/J.STR.2008.04.012", "pmid": "18682218"}
|
{"classification": "Oxidoreductase", "pdb": "7CUP", "deposition_date": "2020-08-23", "title": "Structure of 2,5-dihydroxypridine dioxygenase from pseudomonas putida Kt2440", "Keywords": ["On-heme dioxygenase", " oxidoreductase"], "authors": ["G.q.liu", "H.z.tang"]}
|
||||||
{"pdb": "5CWF", "title": "crystal structure of de novo designed helical repeat protein dhr8", "authors": ["G.Bhabha", "D.C.Ekiert"], "doi": "10.1038/NATURE16162", "pmid": "26675729"}
|
{"classification": "Ligase", "pdb": "1VCN", "deposition_date": "2004-03-10", "title": "Crystal structure of t.th. hb8 ctp synthetase complex with sulfate Anion", "Keywords": ["Etramer", " riken structural genomics/proteomics initiative", " rsgi", "", "Structural genomics", "Ligase"], "authors": ["M.goto", "Riken structural genomics/proteomics initiative (rsgi)"], "pmid": "15296735", "doi": "10.1016/J.STR.2004.05.013"}
|
||||||
|
{"classification": "Transferase/transferase inhibitor", "pdb": "6C9V", "deposition_date": "2018-01-28", "title": "Mycobacterium tuberculosis adenosine kinase bound to (2r,3s,4r,5r)-2- (hydroxymethyl)-5-(6-(4-phenylpiperazin-1-yl)-9h-purin-9-yl) Tetrahydrofuran-3,4-diol", "Keywords": ["Ucleoside analog", " complex", " inhibitor", " structural genomics", " psi-2", "", "Protein structure initiative", "Tb structural genomics consortium", "", "Tbsgc", "Transferase-transferase inhibitor complex"], "authors": ["R.a.crespo", "Tb structural genomics consortium (tbsgc)"], "pmid": "31002508", "doi": "10.1021/ACS.JMEDCHEM.9B00020"}
|
||||||
|
{"classification": "De novo protein", "pdb": "4LPY", "deposition_date": "2013-07-16", "title": "Crystal structure of tencon variant g10", "Keywords": ["Ibronectin type iii fold", " alternate scaffold", " de novo protein"], "authors": ["A.teplyakov", "G.obmolova", "G.l.gilliland"], "pmid": "24375666", "doi": "10.1002/PROT.24502"}
|
||||||
|
{"classification": "Isomerase", "pdb": "2Y88", "deposition_date": "2011-02-03", "title": "Crystal structure of mycobacterium tuberculosis phosphoribosyl Isomerase (variant d11n) with bound prfar", "Keywords": ["Romatic amino acid biosynthesis", " isomerase", " tim-barrel", " histidine", "Biosynthesis", "Tryptophan biosynthesis"], "authors": ["J.kuper", "A.v.due", "A.geerlof", "M.wilmanns"], "pmid": "21321225", "doi": "10.1073/PNAS.1015996108"}
|
||||||
|
{"classification": "Unknown function", "pdb": "1SR0", "deposition_date": "2004-03-22", "title": "Crystal structure of signalling protein from sheep(sps-40) at 3.0a Resolution using crystal grown in the presence of polysaccharides", "Keywords": ["Ignalling protein", " involution", " unknown function"], "authors": ["D.b.srivastava", "A.s.ethayathulla", "N.singh", "J.kumar", "S.sharma", "T.p.singh"]}
|
||||||
|
{"classification": "Dna binding protein", "pdb": "3RH2", "deposition_date": "2011-04-11", "title": "Crystal structure of a tetr-like transcriptional regulator (sama_0099) From shewanella amazonensis sb2b at 2.42 a resolution", "Keywords": ["Na/rna-binding 3-helical bundle", " structural genomics", " joint center", "For structural genomics", "Jcsg", "Protein structure initiative", "Psi-", "Biology", "Dna binding protein"], "authors": ["Joint center for structural genomics (jcsg)"]}
|
||||||
|
{"classification": "Transferase", "pdb": "2WK5", "deposition_date": "2009-06-05", "title": "Structural features of native human thymidine phosphorylase And in complex with 5-iodouracil", "Keywords": ["Lycosyltransferase", " developmental protein", " angiogenesis", "", "5-iodouracil", "Growth factor", "Enzyme kinetics", "", "Differentiation", "Disease mutation", "Thymidine", "Phosphorylase", "Chemotaxis", "Transferase", "Mutagenesis", "", "Polymorphism"], "authors": ["E.mitsiki", "A.c.papageorgiou", "S.iyer", "N.thiyagarajan", "S.h.prior", "", "D.sleep", "C.finnis", "K.r.acharya"], "pmid": "19555658", "doi": "10.1016/J.BBRC.2009.06.104"}
|
||||||
|
{"classification": "Hydrolase", "pdb": "3P9Y", "deposition_date": "2010-10-18", "title": "Crystal structure of the drosophila melanogaster ssu72-pctd complex", "Keywords": ["Hosphatase", " cis proline", " lmw ptp-like fold", " rna polymerase ii ctd", "", "Hydrolase"], "authors": ["J.w.werner-allen", "P.zhou"], "pmid": "21159777", "doi": "10.1074/JBC.M110.197129"}
|
||||||
|
{"classification": "Recombination/dna", "pdb": "6OEO", "deposition_date": "2019-03-27", "title": "Cryo-em structure of mouse rag1/2 nfc complex (dna1)", "Keywords": ["(d)j recombination", " dna transposition", " rag", " scid", " recombination", "", "Recombination-dna complex"], "authors": ["X.chen", "Y.cui", "Z.h.zhou", "W.yang", "M.gellert"], "pmid": "32015552", "doi": "10.1038/S41594-019-0363-2"}
|
||||||
|
{"classification": "Hydrolase", "pdb": "4ECA", "deposition_date": "1997-02-21", "title": "Asparaginase from e. coli, mutant t89v with covalently bound aspartate", "Keywords": ["Ydrolase", " acyl-enzyme intermediate", " threonine amidohydrolase"], "authors": ["G.j.palm", "J.lubkowski", "A.wlodawer"], "pmid": "8706862", "doi": "10.1016/0014-5793(96)00660-6"}
|
||||||
|
{"classification": "Transcription/protein binding", "pdb": "3UVX", "deposition_date": "2011-11-30", "title": "Crystal structure of the first bromodomain of human brd4 in complex With a diacetylated histone 4 peptide (h4k12ack16ac)", "Keywords": ["Romodomain", " bromodomain containing protein 4", " cap", " hunk1", " mcap", "", "Mitotic chromosome associated protein", "Peptide complex", "Structural", "Genomics consortium", "Sgc", "Transcription-protein binding complex"], "authors": ["P.filippakopoulos", "S.picaud", "T.keates", "E.ugochukwu", "F.von delft", "", "C.h.arrowsmith", "A.m.edwards", "J.weigelt", "C.bountra", "S.knapp", "Structural", "Genomics consortium (sgc)"], "pmid": "22464331", "doi": "10.1016/J.CELL.2012.02.013"}
{"classification": "Membrane protein", "pdb": "1TLZ", "deposition_date": "2004-06-10", "title": "Tsx structure complexed with uridine", "Keywords": ["Ucleoside transporter", " beta barrel", " uridine", " membrane", "Protein"], "authors": ["J.ye", "B.van den berg"], "pmid": "15272310", "doi": "10.1038/SJ.EMBOJ.7600330"}
{"classification": "Dna binding protein", "pdb": "7AZD", "deposition_date": "2020-11-16", "title": "Dna polymerase sliding clamp from escherichia coli with peptide 20 Bound", "Keywords": ["Ntibacterial drug", " dna binding protein"], "authors": ["C.monsarrat", "G.compain", "C.andre", "I.martiel", "S.engilberge", "V.olieric", "", "P.wolff", "K.brillet", "M.landolfo", "C.silva da veiga", "J.wagner", "G.guichard", "", "D.y.burnouf"], "pmid": "34806883", "doi": "10.1021/ACS.JMEDCHEM.1C00918"}
{"classification": "Transferase", "pdb": "5N3K", "deposition_date": "2017-02-08", "title": "Camp-dependent protein kinase a from cricetulus griseus in complex With fragment like molecule o-guanidino-l-homoserine", "Keywords": ["Ragment", " complex", " transferase", " serine threonine kinase", " camp", "", "Kinase", "Pka"], "authors": ["C.siefker", "A.heine", "G.klebe"]}
{"classification": "Biosynthetic protein", "pdb": "8H52", "deposition_date": "2022-10-11", "title": "Crystal structure of helicobacter pylori carboxyspermidine Dehydrogenase in complex with nadp", "Keywords": ["Arboxyspermidine dehydrogenase", " biosynthetic protein"], "authors": ["K.y.ko", "S.c.park", "S.y.cho", "S.i.yoon"], "pmid": "36283333", "doi": "10.1016/J.BBRC.2022.10.049"}
{"classification": "Metal binding protein", "pdb": "6DYC", "deposition_date": "2018-07-01", "title": "Co(ii)-bound structure of the engineered cyt cb562 variant, ch3", "Keywords": ["Esigned protein", " 4-helix bundle", " electron transport", " metal binding", "Protein"], "authors": ["F.a.tezcan", "J.rittle"], "pmid": "30778140", "doi": "10.1038/S41557-019-0218-9"}
{"classification": "Protein fibril", "pdb": "6A6B", "deposition_date": "2018-06-27", "title": "Cryo-em structure of alpha-synuclein fiber", "Keywords": ["Lpha-syn fiber", " parkinson disease", " protein fibril"], "authors": ["Y.w.li", "C.y.zhao", "F.luo", "Z.liu", "X.gui", "Z.luo", "X.zhang", "D.li", "C.liu", "X.li"], "pmid": "30065316", "doi": "10.1038/S41422-018-0075-X"}
{"classification": "Dna", "pdb": "7D5E", "deposition_date": "2020-09-25", "title": "Left-handed g-quadruplex containing two bulges", "Keywords": ["-quadruplex", " bulge", " dna", " left-handed"], "authors": ["P.das", "A.maity", "K.h.ngo", "F.r.winnerdy", "B.bakalar", "Y.mechulam", "E.schmitt", "", "A.t.phan"], "pmid": "33503265", "doi": "10.1093/NAR/GKAA1259"}
{"classification": "Transferase", "pdb": "3RSY", "deposition_date": "2011-05-02", "title": "Cellobiose phosphorylase from cellulomonas uda in complex with sulfate And glycerol", "Keywords": ["H94", " alpha barrel", " cellobiose phosphorylase", " disaccharide", "Phosphorylase", "Transferase"], "authors": ["A.van hoorebeke", "J.stout", "W.soetaert", "J.van beeumen", "T.desmet", "S.savvides"]}
{"classification": "Oxidoreductase", "pdb": "7MCI", "deposition_date": "2021-04-02", "title": "Mofe protein from azotobacter vinelandii with a sulfur-replenished Cofactor", "Keywords": ["Zotobacter vinelandii", " mofe-protein", " nitrogenase", " oxidoreductase"], "authors": ["W.kang", "C.lee", "Y.hu", "M.w.ribbe"], "doi": "10.1038/S41929-022-00782-7"}
{"classification": "Dna", "pdb": "1XUW", "deposition_date": "2004-10-26", "title": "Structural rationalization of a large difference in rna affinity Despite a small difference in chemistry between two 2'-o-modified Nucleic acid analogs", "Keywords": ["Na mimetic methylcarbamate amide analog", " dna"], "authors": ["R.pattanayek", "L.sethaphong", "C.pan", "M.prhavc", "T.p.prakash", "M.manoharan", "", "M.egli"], "pmid": "15547979", "doi": "10.1021/JA044637K"}
{"classification": "Lyase", "pdb": "7C0D", "deposition_date": "2020-05-01", "title": "Crystal structure of azospirillum brasilense l-2-keto-3-deoxyarabonate Dehydratase (hydroxypyruvate-bound form)", "Keywords": ["-2-keto-3-deoxyarabonate dehydratase", " lyase"], "authors": ["Y.watanabe", "S.watanabe"], "pmid": "32697085", "doi": "10.1021/ACS.BIOCHEM.0C00515"}
{"classification": "Signaling protein", "pdb": "5LYK", "deposition_date": "2016-09-28", "title": "Crystal structure of intracellular b30.2 domain of btn3a1 bound to Citrate", "Keywords": ["30.2", " butyrophilin", " signaling protein"], "authors": ["F.mohammed", "A.t.baker", "M.salim", "B.e.willcox"], "pmid": "28862425", "doi": "10.1021/ACSCHEMBIO.7B00694"}
{"classification": "Toxin", "pdb": "4IZL", "deposition_date": "2013-01-30", "title": "Structure of the n248a mutant of the panton-valentine leucocidin s Component from staphylococcus aureus", "Keywords": ["I-component leucotoxin", " staphylococcus aureus", " s component", "Leucocidin", "Beta-barrel pore forming toxin", "Toxin"], "authors": ["L.maveyraud", "B.j.laventie", "G.prevost", "L.mourey"], "pmid": "24643034", "doi": "10.1371/JOURNAL.PONE.0092094"}
{"classification": "Dna", "pdb": "6F3C", "deposition_date": "2017-11-28", "title": "The cytotoxic [pt(h2bapbpy)] platinum complex interacting with the Cgtacg hexamer", "Keywords": ["Rug-dna complex", " four-way junction", " dna"], "authors": ["M.ferraroni", "C.bazzicalupi", "P.gratteri", "F.papi"], "pmid": "31046177", "doi": "10.1002/ANIE.201814532"}
{"classification": "Signaling protein/inhibitor", "pdb": "4L5M", "deposition_date": "2013-06-11", "title": "Complexe of arno sec7 domain with the protein-protein interaction Inhibitor n-(4-hydroxy-2,6-dimethylphenyl)benzenesulfonamide at ph6.5", "Keywords": ["Ec-7domain", " signaling protein-inhibitor complex"], "authors": ["F.hoh", "J.rouhana"], "pmid": "24112024", "doi": "10.1021/JM4009357"}
{"classification": "Signaling protein", "pdb": "5I6J", "deposition_date": "2016-02-16", "title": "Crystal structure of srgap2 f-barx", "Keywords": ["Rgap2", " f-bar", " fx", " signaling protein"], "authors": ["M.sporny", "J.guez-haddad", "M.n.isupov", "Y.opatowsky"], "pmid": "28333212", "doi": "10.1093/MOLBEV/MSX094"}
{"classification": "Metal binding protein", "pdb": "1Q80", "deposition_date": "2003-08-20", "title": "Solution structure and dynamics of nereis sarcoplasmic calcium binding Protein", "Keywords": ["Ll-alpha", " metal binding protein"], "authors": ["G.rabah", "R.popescu", "J.a.cox", "Y.engelborghs", "C.t.craescu"], "pmid": "15819893", "doi": "10.1111/J.1742-4658.2005.04629.X"}
{"classification": "Transferase", "pdb": "1TW1", "deposition_date": "2004-06-30", "title": "Beta-1,4-galactosyltransferase mutant met344his (m344h-gal-t1) complex With udp-galactose and magnesium", "Keywords": ["Et344his mutation; closed conformation; mn binding", " transferase"], "authors": ["B.ramakrishnan", "E.boeggeman", "P.k.qasba"], "pmid": "15449940", "doi": "10.1021/BI049007+"}
{"classification": "Rna", "pdb": "2PN4", "deposition_date": "2007-04-23", "title": "Crystal structure of hepatitis c virus ires subdomain iia", "Keywords": ["Cv", " ires", " subdoamin iia", " rna", " strontium", " hepatitis"], "authors": ["Q.zhao", "Q.han", "C.r.kissinger", "P.a.thompson"], "pmid": "18391410", "doi": "10.1107/S0907444908002011"}
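The records above are newline-delimited JSON, one PDB entry per line. A minimal Jackson sketch for iterating such a file is shown below; the file name pdb_records.jsonl is an illustrative assumption, not part of the original data set.

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import com.fasterxml.jackson.databind.JsonNode;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class PdbRecordReader {
        public static void main(String[] args) throws IOException {
            final ObjectMapper mapper = new ObjectMapper();
            // Each non-blank line is one self-contained JSON record.
            for (String line : Files.readAllLines(Paths.get("pdb_records.jsonl"))) {
                if (line.isBlank())
                    continue;
                JsonNode record = mapper.readTree(line);
                System.out.println(record.get("pdb").asText() + " - " + record.get("title").asText());
            }
        }
    }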
@@ -1,6 +1,36 @@
-{"pid": "Q6GZX4", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 41."}], "title": "Putative transcription factor 001R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
+{"pid": " Q6GZX4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 43"}], "title": "Putative transcription factor 001R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
-{"pid": "Q6GZX3", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 42."}], "title": "Uncharacterized protein 002L;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
+{"pid": " Q6GZX3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 45"}], "title": "Uncharacterized protein 002L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
-{"pid": "Q197F8", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 27."}], "title": "Uncharacterized protein 002R;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
+{"pid": " Q197F8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 29"}], "title": "Uncharacterized protein 002R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
-{"pid": "Q197F7", "dates": [{"date": "16-JUN-2009", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "11-JUL-2006", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 23."}], "title": "Uncharacterized protein 003L;", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus."], "references": [{"PubMed": "16912294"}, {" DOI": "10.1128/jvi.00464-06"}]}
+{"pid": " Q197F7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 23"}], "title": "Uncharacterized protein 003L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
-{"pid": "Q6GZX2", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 36."}], "title": "Uncharacterized protein 3R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
+{"pid": " Q6GZX2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 3R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
-{"pid": "Q6GZX1", "dates": [{"date": "28-JUN-2011", "date_info": " integrated into UniProtKB/Swiss-Prot."}, {"date": "19-JUL-2004", "date_info": " sequence version 1."}, {"date": "12-AUG-2020", "date_info": " entry version 34."}], "title": "Uncharacterized protein 004R;", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3).", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus."], "references": [{"PubMed": "15165820"}, {" DOI": "10.1016/j.virol.2004.02.019"}]}
+{"pid": " Q6GZX1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 38"}], "title": "Uncharacterized protein 004R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 005L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZX0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 47"}], "title": "Uncharacterized protein 005R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G88", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-06-28", "date_info": "entry version 53"}], "title": "Putative KilA-N domain-containing protein 006L", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 34"}], "title": "Uncharacterized protein 006R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 32"}], "title": "Uncharacterized protein 007R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197F3", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 007R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q197F2", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-02-23", "date_info": "entry version 22"}], "title": "Uncharacterized protein 008L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 67"}], "title": "Putative helicase 009L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q91G85", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 38"}], "title": "Uncharacterized protein 009R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZW5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 010R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E9", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 28"}], "title": "Uncharacterized protein 011L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 011R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW3", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 012L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197E7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 37"}], "title": "Uncharacterized protein IIV3-013L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZW2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 30"}], "title": "Uncharacterized protein 013R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 014R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZW0", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 50"}], "title": "Uncharacterized protein 015R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 017L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV7", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 018L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV6", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 87"}], "title": "Putative serine/threonine-protein kinase 019R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV5", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 40"}], "title": "Uncharacterized protein 020R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZV4", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 35"}], "title": "Uncharacterized protein 021L", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D8", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-12-14", "date_info": "entry version 35"}], "title": "Transmembrane protein 022L", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV2", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 33"}], "title": "Uncharacterized protein 023R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D7", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2023-02-22", "date_info": "entry version 25"}], "title": "Uncharacterized protein 023R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q6GZV1", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 37"}], "title": "Uncharacterized protein 024R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q197D5", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2006-07-11", "date_info": "sequence version 1"}, {"date": "2022-10-12", "date_info": "entry version 24"}], "title": "Uncharacterized protein 025R", "organism_species": "Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Chloriridovirus"], "references": [{"PubMed": "16912294"}, {"DOI": "10.1128/jvi.00464-06"}]}
{"pid": " Q91G70", "dates": [{"date": "2009-06-16", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2001-12-01", "date_info": "sequence version 1"}, {"date": "2020-08-12", "date_info": "entry version 32"}], "title": "Uncharacterized protein 026R", "organism_species": "Invertebrate iridescent virus 6 (IIV-6) (Chilo iridescent virus)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Betairidovirinae", "Iridovirus"], "references": [{"PubMed": "17239238"}, {"DOI": "10.1186/1743-422x-4-11"}]}
{"pid": " Q6GZU9", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 49"}], "title": "Uncharacterized protein 027R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
{"pid": " Q6GZU8", "dates": [{"date": "2011-06-28", "date_info": "integrated into UniProtKB/Swiss-Prot"}, {"date": "2004-07-19", "date_info": "sequence version 1"}, {"date": "2023-09-13", "date_info": "entry version 55"}], "title": "Uncharacterized protein 028R", "organism_species": "Frog virus 3 (isolate Goorha) (FV-3)", "subjects": ["Viruses", "Varidnaviria", "Bamfordvirae", "Nucleocytoviricota", "Megaviricetes", "Pimascovirales", "Iridoviridae", "Alphairidovirinae", "Ranavirus"], "references": [{"PubMed": "15165820"}, {"DOI": "10.1016/j.virol.2004.02.019"}]}
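The visible change in this hunk is the normalization of the UniProt dates from the flat-file style ("28-JUN-2011", trailing periods) to ISO-8601, plus refreshed entry versions. A small sketch of that date conversion using only the JDK (the class name is illustrative):

    import java.time.LocalDate;
    import java.time.format.DateTimeFormatter;
    import java.time.format.DateTimeFormatterBuilder;
    import java.util.Locale;

    public class UniprotDateNormalizer {
        // Accepts the legacy flat-file form ("28-JUN-2011") case-insensitively.
        private static final DateTimeFormatter LEGACY = new DateTimeFormatterBuilder()
            .parseCaseInsensitive()
            .appendPattern("dd-MMM-yyyy")
            .toFormatter(Locale.ENGLISH);

        public static String toIso(String legacyDate) {
            return LocalDate.parse(legacyDate.trim(), LEGACY).format(DateTimeFormatter.ISO_LOCAL_DATE);
        }

        public static void main(String[] args) {
            System.out.println(toIso("28-JUN-2011")); // prints 2011-06-28
        }
    }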
@@ -26,7 +26,7 @@ class MAGMappingTest {
   @Test
   def mappingMagType(): Unit = {

-    checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = false, "Other literature type")
+    checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = true, "Other literature type")
     checkResult[Publication](
       MagUtility.createResultFromType(Some("BookChapter"), null),
       invisible = false,
@@ -17,45 +17,6 @@ import eu.dnetlib.pace.tree.support.TreeStats;

 class DecisionTreeTest {

-	@Test
-	void testJPath() throws IOException {
-
-		DedupConfig conf = DedupConfig
-			.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
-
-		final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
-
-		Row row = SparkModel.apply(conf).rowFromJson(org);
-
-		System.out.println("row = " + row);
-		Assertions.assertNotNull(row);
-		Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
-
-		System.out.println("row = " + row.getAs("countrytitle"));
-	}
-
-	@Test
-	void jsonToModelTest() throws IOException {
-		DedupConfig conf = DedupConfig
-			.load(
-				IOUtils
-					.toString(
-						SparkOpenorgsDedupTest.class
-							.getResourceAsStream(
-								"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
-
-		final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
-
-		Row row = SparkModel.apply(conf).rowFromJson(org);
-		// to check that the same parsing returns the same row
-		Row row1 = SparkModel.apply(conf).rowFromJson(org);
-
-		Assertions.assertEquals(row, row1);
-		System.out.println("row = " + row);
-		Assertions.assertNotNull(row);
-		Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
-	}
-
 	@Test
 	void organizationDecisionTreeTest() throws Exception {
 		DedupConfig conf = DedupConfig
@@ -452,18 +452,18 @@ public class SparkDedupTest implements Serializable {
 			assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
 			assertEquals(ModelConstants.DEDUP, r.getSubRelType());
 			assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
-			assertTrue(dups.contains(r.getTarget()));
+			assertFalse(dups.contains(r.getTarget()));
 		});

 		final List<Relation> mergedIn = pubs
 			.filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
 			.collectAsList();
-		assertEquals(3, mergedIn.size());
+		assertEquals(1, mergedIn.size());
 		mergedIn.forEach(r -> {
 			assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
 			assertEquals(ModelConstants.DEDUP, r.getSubRelType());
-			assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
+			assertEquals(ModelConstants.MERGES, r.getRelClass());
-			assertTrue(dups.contains(r.getSource()));
+			assertFalse(dups.contains(r.getSource()));
 		});

 		System.out.println("orgs_mergerel = " + orgs_mergerel);
@@ -473,8 +473,8 @@ public class SparkDedupTest implements Serializable {
 		System.out.println("orp_mergerel = " + orp_mergerel);

 		if (CHECK_CARDINALITIES) {
-			assertEquals(1268, orgs_mergerel);
+			assertEquals(1278, orgs_mergerel);
-			assertEquals(1156, pubs.count());
+			assertEquals(1158, pubs.count());
 			assertEquals(292, sw_mergerel);
 			assertEquals(476, ds_mergerel);
 			assertEquals(742, orp_mergerel);
@@ -241,7 +241,6 @@ public class SparkPublicationRootsTest implements Serializable {

 		verifyRoot_case_1(roots, pubs);
 		verifyRoot_case_2(roots, pubs);
-		verifyRoot_case_3(roots, pubs);
 	}

 	private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
@@ -322,34 +321,6 @@ public class SparkPublicationRootsTest implements Serializable {
 		assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
 	}

-	private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
-		Publication root = roots
-			.filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
-			.first();
-		assertNotNull(root);
-
-		Publication pivot_duplicate = pubs
-			.filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
-			.first();
-
-		assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
-
-		Set<String> dups_cf = pubs
-			.collectAsList()
-			.stream()
-			.flatMap(p -> p.getCollectedfrom().stream())
-			.map(KeyValue::getValue)
-			.collect(Collectors.toCollection(HashSet::new));
-
-		Set<String> root_cf = root
-			.getCollectedfrom()
-			.stream()
-			.map(KeyValue::getValue)
-			.collect(Collectors.toCollection(HashSet::new));
-
-		assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
-	}
-
 	@Test
 	@Order(6)
 	void updateEntityTest() throws Exception {
@@ -143,7 +143,9 @@ public class SparkPublicationRootsTest2 implements Serializable {
 					"--graphBasePath", graphInputPath,
 					"--actionSetId", testActionSetId,
 					"--isLookUpUrl", "lookupurl",
-					"--workingPath", workingPath
+					"--workingPath", workingPath,
+					"--hiveMetastoreUris", "none",
+					"--pivotHistoryDatabase", ""
 				}), spark)
 			.run(isLookUpService);

@@ -153,7 +155,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
 			.as(Encoders.bean(Relation.class));

 		assertEquals(
-			3, merges
+			4, merges
 				.filter("relclass == 'isMergedIn'")
 				.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
 				.distinct()
@@ -178,7 +180,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
 			.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
 			.map(asEntity(Publication.class), Encoders.bean(Publication.class));

-		assertEquals(3, roots.count());
+		assertEquals(4, roots.count());

 		final Dataset<Publication> pubs = spark
 			.read()
@@ -195,7 +197,7 @@ public class SparkPublicationRootsTest2 implements Serializable {
 			.collectAsList()
 			.get(0);

-		assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
+		assertEquals("2022-01-01", root.getDateofacceptance().getValue());
 		assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
 		assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
 		assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
@@ -168,7 +168,7 @@ public class SparkStatsTest implements Serializable {
 			.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
 			.count();

-		assertEquals(414, orgs_blocks);
+		assertEquals(412, orgs_blocks);
 		assertEquals(221, pubs_blocks);
 		assertEquals(134, sw_blocks);
 		assertEquals(196, ds_blocks);
@@ -73,12 +73,6 @@
     "name": "Irish Nephrology Society",
     "synonym": []
   },
-  {
-    "id": "100011062",
-    "uri": "http://dx.doi.org/10.13039/100011062",
-    "name": "Asian Spinal Cord Network",
-    "synonym": []
-  },
   {
     "id": "100011096",
     "uri": "http://dx.doi.org/10.13039/100011096",
@@ -223,12 +217,6 @@
     "name": "Global Brain Health Institute",
     "synonym": []
   },
-  {
-    "id": "100015776",
-    "uri": "http://dx.doi.org/10.13039/100015776",
-    "name": "Health and Social Care Board",
-    "synonym": []
-  },
   {
     "id": "100015992",
     "uri": "http://dx.doi.org/10.13039/100015992",
@@ -403,18 +391,6 @@
     "name": "Irish Hospice Foundation",
     "synonym": []
   },
-  {
-    "id": "501100001596",
-    "uri": "http://dx.doi.org/10.13039/501100001596",
-    "name": "Irish Research Council for Science, Engineering and Technology",
-    "synonym": []
-  },
-  {
-    "id": "501100001597",
-    "uri": "http://dx.doi.org/10.13039/501100001597",
-    "name": "Irish Research Council for the Humanities and Social Sciences",
-    "synonym": []
-  },
   {
     "id": "501100001598",
     "uri": "http://dx.doi.org/10.13039/501100001598",
@@ -515,7 +491,7 @@
     "id": "501100002081",
     "uri": "http://dx.doi.org/10.13039/501100002081",
     "name": "Irish Research Council",
-    "synonym": []
+    "synonym": ["501100001596", "501100001597"]
   },
   {
     "id": "501100002736",
@@ -560,7 +560,15 @@ case object Crossref2Oaf {
       "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
           "10.13039/501100013589" | "10.13039/501100000271" =>
         generateSimpleRelationFromAward(funder, "ukri________", a => a)
+      //HFRI
+      case "10.13039/501100013209" =>
+        generateSimpleRelationFromAward(funder, "hfri________", a => a)
+        val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
+        queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
+        queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
+      //ERASMUS+
+      case "10.13039/501100010790" =>
+        generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
       case _ => logger.debug("no match for " + funder.DOI.get)

     }
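Beyond the award-level relations, the new HFRI branch also wires every matched record to a fixed HFRI project in both directions (isProducedBy/produces). A hedged Java sketch of that bidirectional pattern; the Rel record below is a stand-in for the dhp-schemas Relation class, and the ids are placeholders:

    import java.util.ArrayList;
    import java.util.List;

    public class RelationSketch {
        // Minimal stand-in for the real Relation bean; not the repo's class.
        record Rel(String source, String target, String relClass) {}

        // One call per matched record: result -> project and project -> result.
        static List<Rel> linkToProject(String resultId, String projectId) {
            List<Rel> queue = new ArrayList<>();
            queue.add(new Rel(resultId, projectId, "isProducedBy"));
            queue.add(new Rel(projectId, resultId, "produces"));
            return queue;
        }

        public static void main(String[] args) {
            linkToProject("result-id", "project-id").forEach(System.out::println);
        }
    }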
@@ -313,7 +313,7 @@ case object ConversionUtil {
     if (f.author.DisplayName.isDefined)
       a.setFullname(f.author.DisplayName.get)
     if (f.affiliation != null)
-      a.setRawAffiliationString(List(f.affiliation).asJava)
+      a.setAffiliation(List(asField(f.affiliation)).asJava)
     a.setPid(
       List(
         createSP(
@@ -386,7 +386,7 @@ case object ConversionUtil {
       a.setFullname(f.author.DisplayName.get)

     if (f.affiliation != null)
-      a.setRawAffiliationString(List(f.affiliation).asJava)
+      a.setAffiliation(List(asField(f.affiliation)).asJava)

     a.setPid(
       List(
@@ -13,13 +13,13 @@ public class CommunityContentprovider {
 	private String openaireId;
 	private SelectionConstraints selectioncriteria;

-	private String enabled;
+	private Boolean enabled;

-	public String getEnabled() {
+	public Boolean getEnabled() {
 		return enabled;
 	}

-	public void setEnabled(String enabled) {
+	public void setEnabled(Boolean enabled) {
 		this.enabled = enabled;
 	}

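Switching the enabled property from String to Boolean lets a JSON boolean bind natively instead of carrying "true"/"false" strings through the model. A minimal, self-contained check of that binding (the class names here are illustrative):

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class EnabledBindingSketch {
        public static class Provider {
            private Boolean enabled;
            public Boolean getEnabled() { return enabled; }
            public void setEnabled(Boolean enabled) { this.enabled = enabled; }
        }

        public static void main(String[] args) throws Exception {
            // A JSON boolean now binds directly; no string round-trip needed.
            Provider p = new ObjectMapper().readValue("{\"enabled\": true}", Provider.class);
            System.out.println(p.getEnabled()); // true
        }
    }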
@@ -53,6 +53,8 @@ public class Constraints implements Serializable {

 		for (Constraint sc : constraint) {
 			boolean verified = false;
+			if (!param.containsKey(sc.getField()))
+				return false;
 			for (String value : param.get(sc.getField())) {
 				if (sc.verifyCriteria(value.trim())) {
 					verified = true;
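The added guard makes a constraint on a field that is absent from the parameter map fail fast, instead of letting param.get(...) return null and blow up inside the loop. A sketch of the whole routine under that reading; the Constraint interface is a stand-in for the repo's class, and the final return handling is an assumption since the hunk does not show it:

    import java.util.List;
    import java.util.Map;

    class ConstraintsSketch {
        interface Constraint {
            String getField();
            boolean verifyCriteria(String value);
        }

        static boolean verify(Map<String, List<String>> param, List<Constraint> constraints) {
            for (Constraint sc : constraints) {
                if (!param.containsKey(sc.getField()))
                    return false; // missing field: the constraint can never match
                boolean verified = false;
                for (String value : param.get(sc.getField()))
                    if (sc.verifyCriteria(value.trim()))
                        verified = true;
                if (!verified) // assumed: one failed constraint rejects the whole set
                    return false;
            }
            return true;
        }
    }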
@@ -14,6 +14,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
@@ -84,19 +85,26 @@ public class SparkCountryPropagationJob {
 		Dataset<R> res = readPath(spark, sourcePath, resultClazz);

 		log.info("Reading prepared info: {}", preparedInfoPath);
-		Dataset<ResultCountrySet> prepared = spark
+		final Dataset<Row> preparedInfoRaw = spark
 			.read()
-			.json(preparedInfoPath)
+			.json(preparedInfoPath);
-			.as(Encoders.bean(ResultCountrySet.class));
-
-		res
-			.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
-			.map(getCountryMergeFn(), Encoders.bean(resultClazz))
-			.write()
-			.option("compression", "gzip")
-			.mode(SaveMode.Overwrite)
-			.json(outputPath);

+		if (!preparedInfoRaw.isEmpty()) {
+			final Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
+			res
+				.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
+				.map(getCountryMergeFn(), Encoders.bean(resultClazz))
+				.write()
+				.option("compression", "gzip")
+				.mode(SaveMode.Overwrite)
+				.json(outputPath);
+		} else {
+			res
+				.write()
+				.option("compression", "gzip")
+				.mode(SaveMode.Overwrite)
+				.json(outputPath);
+		}
 	}

 	private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
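The job now reads the prepared-info path untyped first because spark.read().json(...) over an empty input yields a Dataset with no inferred schema, so casting it with Encoders.bean(...) or joining on resultId fails at analysis time. The guard in miniature (a fragment, not a full job: res, resultClazz, outputPath and the helpers readPath/getCountryMergeFn are the job's own members shown above):

    Dataset<Row> preparedInfoRaw = spark.read().json(preparedInfoPath);
    if (!preparedInfoRaw.isEmpty()) {
        Dataset<ResultCountrySet> prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class));
        res.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
            .map(getCountryMergeFn(), Encoders.bean(resultClazz))
            .write().option("compression", "gzip").mode(SaveMode.Overwrite).json(outputPath);
    } else {
        // nothing to merge: pass the results through unchanged
        res.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(outputPath);
    }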
File diff suppressed because it is too large
@@ -147,6 +147,7 @@ public class CleanGraphSparkJob {
 				.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
 				.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
 				.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
+				.map((MapFunction<T, T>) GraphCleaningFunctions::dedicatedUglyHacks, Encoders.bean(clazz))
 				.filter((FilterFunction<T>) GraphCleaningFunctions::filter);

 			// read the master-duplicate tuples
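Each stage in this pipeline is a pure T -> T MapFunction over the same bean encoder, which is what lets a new pass like dedicatedUglyHacks be spliced in without touching its neighbours. The shape of such a pipeline, with made-up pass names (the real job uses GraphCleaningFunctions and OafCleaner as shown above):

    Dataset<T> cleaned = input
        .map((MapFunction<T, T>) Cleaning::passOne, Encoders.bean(clazz))
        .map((MapFunction<T, T>) Cleaning::passTwo, Encoders.bean(clazz))
        .filter((FilterFunction<T>) Cleaning::keep);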
@@ -9,7 +9,10 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -22,6 +25,8 @@ public class GraphHiveTableImporterJob {

 	private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);

+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
 	public static void main(String[] args) throws Exception {

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -69,12 +74,7 @@ public class GraphHiveTableImporterJob {
 	private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
 		Class<T> clazz, int numPartitions) {

-		final Encoder<T> clazzEncoder = Encoders.bean(clazz);
-
-		Dataset<Row> dataset = spark
-			.read()
-			.schema(clazzEncoder.schema())
-			.json(inputPath);
+		Dataset<String> dataset = spark.read().textFile(inputPath);

 		if (numPartitions > 0) {
 			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@@ -82,6 +82,7 @@ public class GraphHiveTableImporterJob {
 		}

 		dataset
+			.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.saveAsTable(tableIdentifier(hiveDbName, clazz));
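The importer now reads the dump as plain text and deserializes each line with Jackson before writing to Hive, instead of relying on spark.read().schema(...).json(...). This keeps per-record parsing behaviour aligned with the Jackson-based readers used elsewhere in the codebase, at the cost of a map over every line. Condensed form of the new read path (a fragment: OBJECT_MAPPER, clazz, hiveDbName and tableIdentifier are the job's own members shown in the hunks above):

    Dataset<String> lines = spark.read().textFile(inputPath);
    lines
        .map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
        .write()
        .mode(SaveMode.Overwrite)
        .saveAsTable(tableIdentifier(hiveDbName, clazz));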
@@ -94,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 			author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
 		}

-		author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
+		author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info));
 		author.setPid(preparePids(n, info));
 		author.setRank(pos++);
 		res.add(author);
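prepareListFields and the Field type are repo-internal; the XPath side of this change can be sketched independently with dom4j, on the assumption (from the surrounding mapper code) that the nodes are dom4j nodes. A hedged approximation, not the repo's code — the real mapper additionally wraps each value in a Field carrying provenance info:

    import java.util.ArrayList;
    import java.util.List;

    import org.dom4j.Node;

    public class AffiliationExtractorSketch {
        // Collects the text of every <affiliation> child, namespace-agnostically.
        static List<String> affiliations(Node authorNode) {
            List<String> res = new ArrayList<>();
            for (Object o : authorNode.selectNodes("./*[local-name()='affiliation']")) {
                res.add(((Node) o).getText().trim());
            }
            return res;
        }
    }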
@@ -85,7 +85,7 @@
 		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>

 	<fork name="fork_downloads_csv">
 		<path start="download_gold"/>
 		<path start="download_doaj_json"/>
 	</fork>
@@ -223,11 +223,13 @@
 			--executor-memory=${sparkExecutorMemory}
 			--executor-cores=${sparkExecutorCores}
 			--driver-memory=${sparkDriverMemory}
+			--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 			--conf spark.extraListeners=${spark2ExtraListeners}
 			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+			--conf spark.sql.shuffle.partitions=15000
 		</spark-opts>
 		<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
 		<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
@@ -253,11 +255,13 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
 <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
@@ -278,6 +282,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
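Note: the three hunks above apply the same two tunings to each Spark action: an explicit executor memory overhead (pinned here to the same value as the executor memory) and a fixed shuffle-partition count. The equivalent settings expressed through the SparkSession builder, as a sketch with placeholder values:

import org.apache.spark.sql.SparkSession;

public class SparkTuningSketch {
	public static void main(String[] args) {
		SparkSession spark = SparkSession
			.builder()
			.appName("tuning-sketch")
			// off-heap headroom per executor, added on top of spark.executor.memory;
			// "7G" is a placeholder standing in for ${sparkExecutorMemory}
			.config("spark.executor.memoryOverhead", "7G")
			// number of partitions used when shuffling for joins and aggregations
			.config("spark.sql.shuffle.partitions", "15000")
			.getOrCreate();
		spark.stop();
	}
}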
@@ -73,10 +73,14 @@ public class GraphHiveImporterJobTest {
 GraphHiveImporterJob
 	.main(
 		new String[] {
-			"--isSparkSessionManaged", Boolean.FALSE.toString(),
-			"--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
-			"--hiveMetastoreUris", "",
-			"--hiveDbName", dbName
+			"-isSparkSessionManaged",
+			Boolean.FALSE.toString(),
+			"-inputPath",
+			getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
+			"-hiveMetastoreUris",
+			"",
+			"-hiveDbName",
+			dbName
 		});

 ModelSupport.oafTypes
@@ -406,15 +406,15 @@ class MappersTest {
 assertEquals("Baracchini", author.get().getSurname());
 assertEquals("Theo", author.get().getName());

-assertEquals(1, author.get().getRawAffiliationString().size());
-final Optional<String> opAff = author
+assertEquals(1, author.get().getAffiliation().size());
+final Optional<Field<String>> opAff = author
 	.get()
-	.getRawAffiliationString()
+	.getAffiliation()
 	.stream()
 	.findFirst();
 assertTrue(opAff.isPresent());
-final String affiliation = opAff.get();
-assertEquals("ISTI-CNR", affiliation);
+final Field<String> affiliation = opAff.get();
+assertEquals("ISTI-CNR", affiliation.getValue());

 assertFalse(d.getSubject().isEmpty());
 assertFalse(d.getInstance().isEmpty());
Binary files not shown (5 files changed).
@@ -31,11 +31,5 @@ class ORCIDAuthorMatchersTest {
 assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
 // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
 }
-@Test def testDocumentationNames(): Unit = {
-	assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
-}
-
-@Test def testDocumentationNames2(): Unit = {
-	assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller Jones"))
-}
 }
@@ -69,7 +69,7 @@
 </configuration>
 </global>

-<start to="oaiphm_provision"/>
+<start to="irish_oaiphm_provision"/>

 <kill name="Kill">
 <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@@ -67,7 +67,7 @@ public class PrepareRelationsJobTest {
 @Test
 void testRunPrepareRelationsJob(@TempDir Path testPath) throws Exception {

-	final int maxRelations = 20;
+	final int maxRelations = 5;
 	PrepareRelationsJob
 		.main(
 			new String[] {
|
@ -86,7 +86,7 @@ public class PrepareRelationsJobTest {
|
||||||
.as(Encoders.bean(Relation.class))
|
.as(Encoders.bean(Relation.class))
|
||||||
.cache();
|
.cache();
|
||||||
|
|
||||||
assertEquals(maxRelations, out.count());
|
assertEquals(44, out.count());
|
||||||
|
|
||||||
Dataset<Row> freq = out
|
Dataset<Row> freq = out
|
||||||
.toDF()
|
.toDF()
|
||||||
|
@@ -101,12 +101,8 @@ public class PrepareRelationsJobTest {
 long affiliation = getRows(freq, AFFILIATION).get(0).getAs("count");

 assertEquals(outcome, participation);
-assertTrue(outcome > affiliation);
-assertTrue(participation > affiliation);
-
-assertEquals(7, outcome);
-assertEquals(7, participation);
-assertEquals(6, affiliation);
+assertEquals(outcome, affiliation);
+assertEquals(4, affiliation);
 }

 protected List<Row> getRows(Dataset<Row> freq, String col) {
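Note: the updated test lowers maxRelations from 20 to 5 and replaces the ordering assertions with exact per-class counts (44 relations overall), which suggests the cap is applied per group rather than to the whole output. A sketch of how the frequency table behind the getRows helper can be computed, assuming a relClass column as in the test:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class RelationFreqSketch {
	// One row per relation class with its count; the test reads the "count"
	// column of the OUTCOME, PARTICIPATION and AFFILIATION rows.
	static Dataset<Row> frequencies(Dataset<Row> relations) {
		return relations.groupBy("relClass").count();
	}
}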
@@ -91,9 +91,6 @@ class SolrRecordDumpJobTest {
 public void prepareMocks() throws ISLookUpException, IOException {
 isLookupClient.setIsLookup(isLookUpService);

-Mockito
-	.when(isLookupClient.getDsId(Mockito.anyString()))
-	.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
 Mockito
 	.when(isLookupClient.getLayoutSource(Mockito.anyString()))
 	.thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));
@@ -48,16 +48,25 @@
 <case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
 <case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
 <case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
-<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>
+<case to="clear-working-dir">${wf:conf('resume') eq "start"}</case>

 <!-- Aggregation of impact scores on the project level -->
 <case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>
 <case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>

-<default to="create-openaire-ranking-graph" />
+<default to="clear-working-dir" />
 </switch>
 </decision>

+<action name="clear-working-dir">
+	<fs>
+		<delete path="${workingDir}"/>
+		<mkdir path="${workingDir}"/>
+	</fs>
+	<ok to="create-openaire-ranking-graph"/>
+	<error to="clear-working-dir-fail"/>
+</action>
+
 <!-- initial step: create citation network -->
 <action name="create-openaire-ranking-graph">
 <spark xmlns="uri:oozie:spark-action:0.2">
|
@ -618,6 +627,10 @@
|
||||||
<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
|
<kill name="clear-working-dir-fail">
|
||||||
|
<message>Re-create working dir failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
<!-- Define ending node -->
|
<!-- Define ending node -->
|
||||||
<end name="end" />
|
<end name="end" />
|
||||||
|
|
||||||
|
|
|
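Note: the new clear-working-dir action wipes and re-creates ${workingDir} through Oozie's <fs> action before the ranking graph is rebuilt, with a dedicated kill node for its failure path. The same reset, expressed against the Hadoop FileSystem API as a minimal sketch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class ClearWorkingDirSketch {
	// Equivalent of the workflow's <fs><delete/><mkdir/></fs> block: remove
	// the working directory recursively, then re-create it empty.
	static void clearWorkingDir(Configuration conf, String workingDir) throws java.io.IOException {
		FileSystem fs = FileSystem.get(conf);
		Path dir = new Path(workingDir);
		fs.delete(dir, true); // recursive delete; returns false if the path was absent
		fs.mkdirs(dir);
	}
}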
@@ -32,7 +32,7 @@ select distinct * from (
 from SOURCE.result r
 join SOURCE.result_projects rp on rp.id=r.id
 join SOURCE.project p on p.id=rp.project
-join TARGET.irish_funders irf on irf.funder=p.funder
+join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
 union all
 select r.*
 from SOURCE.result r
@@ -1,3 +1,79 @@
+--drop database if exists TARGET cascade;
+--create database if not exists TARGET;
+--
+--create view if not exists TARGET.category as select * from SOURCE.category;
+--create view if not exists TARGET.concept as select * from SOURCE.concept;
+--create view if not exists TARGET.context as select * from SOURCE.context;
+--create view if not exists TARGET.country as select * from SOURCE.country;
+--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
+--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
+--create view if not exists TARGET.funder as select * from SOURCE.funder;
+--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
+--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
+--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
+--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
+--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
+--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
+--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
+--
+--create table TARGET.result stored as parquet as
+-- select distinct * from (
+-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
+-- union all
+-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
+-- union all
+-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
+-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
+-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
+-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
+-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
+-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
+-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
+-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
+-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
+-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
+-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
+-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
+-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
+-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
+-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
+-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
+-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
+-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
+-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
+-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
+-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
+-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
+-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
+-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
+-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
+-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
+-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
+-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
+-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
+-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
+-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
+-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
+-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
+-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
+-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
+-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
+-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
+-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
+-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
+-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
+-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
+-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
+-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
+-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
+-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
+-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
+-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
+-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
+-- ) )) foo;
+--
+--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
+
 create view if not exists TARGET.category as select * from SOURCE.category;
 create view if not exists TARGET.concept as select * from SOURCE.concept;
 create view if not exists TARGET.context as select * from SOURCE.context;
@@ -81,17 +81,7 @@ create table TARGET.result stored as parquet as
 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
-'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
-'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
-'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
-'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
-'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
-'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
-'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
-'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
-'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
-'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
-'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
+'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
 ))) foo;

 --ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
@@ -61,17 +61,7 @@ create table TARGET.result stored as parquet as
 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
-'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
-'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
-'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
-'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
-'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
-'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
-'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
-'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
-'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
-'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
-'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
+'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
 ))) foo;

 --ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
@@ -1,18 +0,0 @@
-# Install the whole "dnet-hadoop" project.
-
-# Delete this module's previous build-files in order to avoid any conflicts.
-rm -rf target/ ||
-
-# Go to the root directory of this project.
-cd ../../
-
-# Select the build profile.
-DEFAULT_PROFILE='' # It's the empty profile.
-NEWER_VERSIONS_PROFILE='-Pscala-2.12'
-CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
-
-# Install the project.
-mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
-
-# We skip tests for all modules, since the take a big amount of time and some of them fail.
-# Any test added to this module, will be executed in the "runOozieWorkflow.sh" script.
@@ -1,20 +0,0 @@
-# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file.
-
-# Select the build profile.
-DEFAULT_PROFILE='' # It's the empty profile.
-NEWER_VERSIONS_PROFILE='-Pscala-2.12'
-CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
-
-# Build and deploy this module.
-mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
-	-Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
-
-# Show the Oozie-job-ID.
-echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
-cat ./target/extract-and-run-on-remote-host.log
-
-# Check oozie workflow status
-# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
-
-# Get the <job-ID> from the previous output and check the logs:
-# yarn logs -applicationId application_<job-ID>
@@ -1,10 +1,8 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 --------------------------------------------------------------
 --------------------------------------------------------------
 -- Stats database creation
 --------------------------------------------------------------
 --------------------------------------------------------------

-DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
-CREATE database ${stats_db_name}; /*EOS*/
+DROP database IF EXISTS ${stats_db_name} CASCADE;
+CREATE database ${stats_db_name};
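Note: in the stats scripts from here on, the removed side of each hunk terminates statements with a /*EOS*/ comment. The marker reads as an explicit end-of-statement separator, presumably so the workflow can split multi-line Hive scripts into individual statements before submitting them; a hypothetical splitter along those lines:

import java.util.ArrayList;
import java.util.List;

class EosSplitterSketch {
	// Hypothetical: split a script body on /*EOS*/ markers, dropping
	// whitespace-only fragments, so each piece is one submittable statement.
	static List<String> splitStatements(String script) {
		List<String> statements = new ArrayList<>();
		for (String fragment : script.split("/\\*EOS\\*/")) {
			String stmt = fragment.trim();
			if (!stmt.isEmpty()) {
				statements.add(stmt);
			}
		}
		return statements;
	}
}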
@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 -- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture)
@@ -7,27 +5,27 @@ set mapred.job.queue.name=analytics; /*EOS*/
 ------------------------------------------------------------------------------------------------
 CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
 SELECT *
-FROM ${external_stats_db_name}.fundref; /*EOS*/
+FROM ${external_stats_db_name}.fundref;

 CREATE OR REPLACE VIEW ${stats_db_name}.country AS
 SELECT *
-FROM ${external_stats_db_name}.country; /*EOS*/
+FROM ${external_stats_db_name}.country;

 CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
 SELECT *
-FROM ${external_stats_db_name}.countrygdp; /*EOS*/
+FROM ${external_stats_db_name}.countrygdp;

 CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
 SELECT *
-FROM ${external_stats_db_name}.roarmap; /*EOS*/
+FROM ${external_stats_db_name}.roarmap;

 CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
 SELECT *
-FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
+FROM ${external_stats_db_name}.rndexpediture;

 CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
 SELECT *
-FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
+FROM ${external_stats_db_name}.licenses_normalized;

 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
@@ -35,23 +33,23 @@ FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 create or replace view ${stats_db_name}.usage_stats as
-select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
+select * from openaire_prod_usage_stats.usage_stats;

 create or replace view ${stats_db_name}.downloads_stats as
-select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
+select * from openaire_prod_usage_stats.downloads_stats;

 create or replace view ${stats_db_name}.pageviews_stats as
-select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
+select * from openaire_prod_usage_stats.pageviews_stats;

 create or replace view ${stats_db_name}.views_stats as
-select * from openaire_prod_usage_stats.views_stats; /*EOS*/
+select * from openaire_prod_usage_stats.views_stats;

 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 -- Creation date of the database
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge;

 create table ${stats_db_name}.creation_date STORED AS PARQUET as
-select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/
+select date_format(current_date(), 'dd-MM-yyyy') as date;
@@ -1,11 +1,110 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 ----------------------------------------------------------------
 ----------------------------------------------------------------
 -- Post processing - Updates on main tables
 ----------------------------------------------------------------
 ----------------------------------------------------------------

+--Datasource temporary table updates
+UPDATE ${stats_db_name}.datasource_tmp
+SET harvested='true'
+WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
+                            FROM ${stats_db_name}.datasource_tmp d,
+                                 ${stats_db_name}.result_datasources rd
+                            WHERE d.id = rd.datasource);
+
+-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
+UPDATE ${stats_db_name}.project_tmp
+SET haspubs='yes'
+WHERE project_tmp.id IN (SELECT pr.id
+                         FROM ${stats_db_name}.project_results pr,
+                              ${stats_db_name}.result r
+                         WHERE pr.result = r.id
+                           AND r.type = 'publication');
+DROP TABLE IF EXISTS ${stats_db_name}.project purge;
+
+CREATE TABLE ${stats_db_name}.project stored as parquet as
+SELECT p.id,
+       p.acronym,
+       p.title,
+       p.funder,
+       p.funding_lvl0,
+       p.funding_lvl1,
+       p.funding_lvl2,
+       p.ec39,
+       p.type,
+       p.startdate,
+       p.enddate,
+       p.start_year,
+       p.end_year,
+       p.duration,
+       CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
+       CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
+       CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
+       CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
+       p.callidentifier,
+       p.code,
+       p.totalcost,
+       p.fundedamount,
+       p.currency
+FROM ${stats_db_name}.project_tmp p
+LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
+           FROM ${stats_db_name}.project_results pr
+           INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
+           WHERE r.type = 'publication'
+           GROUP BY pr.id) AS prr1 on prr1.id = p.id
+LEFT JOIN (SELECT pp.id,
+                  max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
+                  count(distinct r.id) AS dp
+           FROM ${stats_db_name}.project_tmp pp,
+                ${stats_db_name}.project_results pr,
+                ${stats_db_name}.result r
+           WHERE pp.id = pr.id
+             AND pr.result = r.id
+             AND r.type = 'publication'
+             AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
+           GROUP BY pp.id) AS prr2
+    ON prr2.id = p.id;
+
+UPDATE ${stats_db_name}.publication_tmp
+SET delayed = 'yes'
+WHERE publication_tmp.id IN (SELECT distinct r.id
+                             FROM ${stats_db_name}.result r,
+                                  ${stats_db_name}.project_results pr,
+                                  ${stats_db_name}.project_tmp p
+                             WHERE r.id = pr.result
+                               AND pr.id = p.id
+                               AND to_date(r.date) - to_date(p.enddate) > 0);
+
+UPDATE ${stats_db_name}.dataset_tmp
+SET delayed = 'yes'
+WHERE dataset_tmp.id IN (SELECT distinct r.id
+                         FROM ${stats_db_name}.result r,
+                              ${stats_db_name}.project_results pr,
+                              ${stats_db_name}.project_tmp p
+                         WHERE r.id = pr.result
+                           AND pr.id = p.id
+                           AND to_date(r.date) - to_date(p.enddate) > 0);
+
+UPDATE ${stats_db_name}.software_tmp
+SET delayed = 'yes'
+WHERE software_tmp.id IN (SELECT distinct r.id
+                          FROM ${stats_db_name}.result r,
+                               ${stats_db_name}.project_results pr,
+                               ${stats_db_name}.project_tmp p
+                          WHERE r.id = pr.result
+                            AND pr.id = p.id
+                            AND to_date(r.date) - to_date(p.enddate) > 0);
+
+UPDATE ${stats_db_name}.otherresearchproduct_tmp
+SET delayed = 'yes'
+WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
+                                      FROM ${stats_db_name}.result r,
+                                           ${stats_db_name}.project_results pr,
+                                           ${stats_db_name}.project_tmp p
+                                      WHERE r.id = pr.result
+                                        AND pr.id = p.id
+                                        AND to_date(r.date) - to_date(p.enddate) > 0);
+
 CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
 SELECT result_projects.id AS result,
        result_projects.project AS project_results,
@@ -17,4 +116,4 @@ FROM ${stats_db_name}.result_projects,
 ${stats_db_name}.project
 WHERE result_projects.id = result.id
 AND result.type = 'publication'
-AND project.id = result_projects.project; /*EOS*/
+AND project.id = result_projects.project;
@@ -1,4 +1,42 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+------------------------------------------------------------------------------------------------------
+-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
+------------------------------------------------------------------------------------------------------
+DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
+
+CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.datasource_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
+
+CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.publication_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
+
+CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.dataset_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.software purge;
+
+CREATE TABLE ${stats_db_name}.software stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.software_tmp;
+
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
+
+CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
+SELECT *
+FROM ${stats_db_name}.otherresearchproduct_tmp;
+
+DROP TABLE ${stats_db_name}.project_tmp;
+DROP TABLE ${stats_db_name}.datasource_tmp;
+DROP TABLE ${stats_db_name}.publication_tmp;
+DROP TABLE ${stats_db_name}.dataset_tmp;
+DROP TABLE ${stats_db_name}.software_tmp;
+DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;

 ----------------------------------------------
 -- Re-creating views from final parquet tables
@@ -16,4 +54,4 @@ SELECT *, bestlicence AS access_mode
 FROM ${stats_db_name}.dataset
 UNION ALL
 SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
+FROM ${stats_db_name}.otherresearchproduct;
@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 ------------------------------------------------------
 ------------------------------------------------------
 -- Additional relations
@@ -7,10 +5,10 @@ set mapred.job.queue.name=analytics; /*EOS*/
 -- Sources related tables/views
 ------------------------------------------------------
 ------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -18,12 +16,12 @@ LEFT OUTER JOIN
 (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -31,12 +29,12 @@ LEFT OUTER JOIN
 (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -44,12 +42,12 @@ LEFT OUTER JOIN
 (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
 from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
@@ -57,7 +55,7 @@ LEFT OUTER JOIN
 (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;

 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
 SELECT * FROM ${stats_db_name}.publication_sources
@@ -66,24 +64,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources
 UNION ALL
 SELECT * FROM ${stats_db_name}.software_sources
 UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
+SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;

-DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
-select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
+select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
 from (
 SELECT substr(res.id, 4) as id, auth_pid.value as orcid
 FROM ${openaire_db_name}.result res
 LATERAL VIEW explode(author) a as auth
 LATERAL VIEW explode(auth.pid) ap as auth_pid
 LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
-WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
+WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;

-DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_result purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
-select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
+select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
@@ -93,12 +91,12 @@ where reltype='resultResult'
 and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
 and r1.resulttype.classname != 'other'
 and r2.resulttype.classname != 'other'
-and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
+and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;

-DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
-select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations
+select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
@@ -110,12 +108,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
 and r1.resulttype.classname != 'other'
 and r2.resulttype.classname != 'other'
 and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
-group by substr(target, 4); /*EOS*/
+group by substr(target, 4);

-DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
-select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references
+select substr(source, 4) as id, count(distinct substr(target, 4)) as references
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
@@ -127,4 +125,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
 and r1.resulttype.classname != 'other'
 and r2.resulttype.classname != 'other'
 and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
-group by substr(source, 4); /*EOS*/
+group by substr(source, 4);
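Note: throughout the file above, the removed side of each CTAS statement carries a /*+ COALESCE(100) */ hint, which in Spark SQL (2.4+) bounds the result to 100 partitions so the table lands as a limited number of files; the kept side drops it. The hinted form and its Dataset-API counterpart, as a sketch over a hypothetical source table:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class CoalesceHintSketch {
	// Hinted variant: the COALESCE hint reduces output partitions, and hence
	// output files, without forcing a full shuffle.
	static Dataset<Row> hinted(SparkSession spark) {
		return spark.sql("SELECT /*+ COALESCE(100) */ id, datasource FROM publication_sources_src");
	}

	// The same effect expressed on the Dataset API.
	static Dataset<Row> viaApi(Dataset<Row> df) {
		return df.coalesce(100);
	}
}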
@ -1,5 +1,4 @@
|
||||||
set mapred.job.queue.name=analytics; /*EOS*/
|
set mapred.job.queue.name=analytics;
|
||||||
|
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
-- Additional relations
|
-- Additional relations
|
||||||
|
@ -7,33 +6,33 @@ set mapred.job.queue.name=analytics; /*EOS*/
|
||||||
-- Licences related tables/views
|
-- Licences related tables/views
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
|
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge;
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
|
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
|
||||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||||
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
|
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||||
|
|
||||||
-DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
+SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;

-DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
+SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;

-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
+SELECT substr(p.id, 4) as id, licenses.value as type
 from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;

 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
 SELECT * FROM ${stats_db_name}.publication_licenses
@@ -42,29 +41,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
 UNION ALL
 SELECT * FROM ${stats_db_name}.software_licenses
 UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
+SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;

-DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
-select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
-from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
+select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
+from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;

-DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource
+SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
 FROM (
 SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
 from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
 LEFT OUTER JOIN (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge;

 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
-select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
+select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
 lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
-WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/
+WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
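Every hunk in this file follows the same two-part pattern: one side of each changed line carries a /*+ COALESCE(100) */ hint and a trailing /*EOS*/ marker, the other does not. COALESCE is a standard Spark SQL partitioning hint that caps the query result at the given number of partitions without triggering a shuffle, so a CREATE TABLE ... AS writes on the order of 100 parquet files instead of one file per shuffle partition. /*EOS*/ appears to serve as an end-of-statement marker that lets the workflow split a multi-statement script before submitting each statement; that reading is an assumption about this repository's runner. A minimal sketch of the combined pattern, with example_db as a hypothetical database name:

    -- cap the result at 100 partitions so the CTAS writes ~100 parquet files
    CREATE TABLE IF NOT EXISTS example_db.dataset_licenses_sketch STORED AS PARQUET AS
    SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, licenses.value AS type
    FROM example_db.dataset p LATERAL VIEW explode(p.instance.license) instances AS licenses
    WHERE licenses.value IS NOT NULL AND licenses.value != ''; /*EOS*/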
@@ -1,4 +1,4 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;

 ------------------------------------------------------
 ------------------------------------------------------
@@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics; /*EOS*/
 ------------------------------------------------------
 ------------------------------------------------------

-DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
 with peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -18,15 +18,15 @@ non_peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
 where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select /*+ COALESCE(100) */ distinct *
+select distinct *
 from (
 select peer_reviewed.* from peer_reviewed
 union all
 select non_peer_reviewed.* from non_peer_reviewed
 left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr; /*EOS*/
+where peer_reviewed.id is null) pr;

-DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
 with peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -36,15 +36,15 @@ non_peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
 where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select /*+ COALESCE(100) */ distinct *
+select distinct *
 from (
 select peer_reviewed.* from peer_reviewed
 union all
 select non_peer_reviewed.* from non_peer_reviewed
 left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr; /*EOS*/
+where peer_reviewed.id is null) pr;

-DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
 with peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -54,15 +54,15 @@ non_peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
 where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select /*+ COALESCE(100) */ distinct *
+select distinct *
 from (
 select peer_reviewed.* from peer_reviewed
 union all
 select non_peer_reviewed.* from non_peer_reviewed
 left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr; /*EOS*/
+where peer_reviewed.id is null) pr;

-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
 with peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@@ -72,13 +72,13 @@ non_peer_reviewed as (
 select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
 from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
 where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select /*+ COALESCE(100) */ distinct *
+select distinct *
 from (
 select peer_reviewed.* from peer_reviewed
 union all
 select non_peer_reviewed.* from non_peer_reviewed
 left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr; /*EOS*/
+where peer_reviewed.id is null) pr;

 CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
 select * from ${stats_db_name}.publication_refereed
@@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed
 union all
 select * from ${stats_db_name}.software_refereed
 union all
-select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
+select * from ${stats_db_name}.otherresearchproduct_refereed;

-DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge;

 create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
-select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
+select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
 cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
 from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
-where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
+where measures_ids.id!='views' and measures_ids.id!='downloads';

-DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge;

 create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
-select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
+select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
 cast(rel.properties[0].value as double) apc_amount,
 rel.properties[1].value apc_currency
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.organization o on o.id=rel.source
 join ${openaire_db_name}.result r on r.id=rel.target
-where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/
+where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
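Each *_refereed table above resolves a precedence rule: a result counts as peerReviewed if any of its instances carries that classification, and falls back to nonPeerReviewed only when no peerReviewed instance exists. The fallback is the classic left-join anti-join, sketched here with the CTE names from the queries above:

    -- keep non_peer_reviewed rows only for ids absent from peer_reviewed
    select n.id, n.refereed
    from non_peer_reviewed n
    left join peer_reviewed p on p.id = n.id
    where p.id is null;  -- no peerReviewed match exists, so the fallback row wins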
@@ -1,27 +1,27 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;

 -------------------------------------------
 --- Extra tables, mostly used by indicators

-DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge;

 create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count
+select r.id, count(distinct p.id) as count
 from ${stats_db_name}.result r
 left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
 left outer join ${stats_db_name}.project p on p.id=rp.project
-group by r.id; /*EOS*/
+group by r.id;

-DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge;

 create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count
+select r.id, count(distinct p.funder) as count
 from ${stats_db_name}.result r
 left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
 left outer join ${stats_db_name}.project p on p.id=rp.project
-group by r.id; /*EOS*/
+group by r.id;

-DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge;

 create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
 with rcount as (
@@ -30,39 +30,39 @@ with rcount as (
 left outer join ${stats_db_name}.result_projects rp on rp.project=p.id
 left outer join ${stats_db_name}.result r on r.id=rp.id
 group by r.type, p.id )
-select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
+select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
 sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets,
 sum(case when rcount.type='software' then rcount.count else 0 end) as software,
 sum(case when rcount.type='other' then rcount.count else 0 end) as other
 from rcount
-group by rcount.pid; /*EOS*/
+group by rcount.pid;

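project_resultcount pivots the per-type counts of rcount into one row per project via conditional aggregation: each SUM(CASE ...) column accumulates only the rows of a single result type. A reduced sketch of the same idiom, reusing the rcount names from the query above:

    -- one output row per project, one column per result type
    select rcount.pid,
           sum(case when rcount.type = 'publication' then rcount.count else 0 end) as publications,
           sum(case when rcount.type = 'dataset' then rcount.count else 0 end) as datasets
    from rcount
    group by rcount.pid;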
-create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
-create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
-create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
-create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
-create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
-create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
-create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
+create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
+create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
+create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
+create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
+create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
+create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
+create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates;

-DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge;

 create table if not exists ${stats_db_name}.result_instance stored as parquet as
-select /*+ COALESCE(100) */ distinct r.*
+select distinct r.*
 from (
 select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
 substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
 from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
-join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
+join ${stats_db_name}.result res on res.id=r.id;

-DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;

 create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
-select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency
+select distinct r.id, r.amount, r.currency
 from (
 select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
 from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
 join ${stats_db_name}.result res on res.id=r.id
-where r.amount is not null; /*EOS*/
+where r.amount is not null;

-create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/
+create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
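result_instance relies on the difference between the two explode forms above: a plain LATERAL VIEW explode(...) drops rows whose array is empty or null, while LATERAL VIEW OUTER explode(...) keeps them, emitting one row with null for the exploded columns. That is why instances without any pid still appear in result_instance with a null pid. A minimal sketch, with example_db as an illustrative database name:

    select r.id, p.value as pid
    from example_db.result r
      lateral view explode(r.instance) instances as inst   -- one row per instance; no instances, no row
      lateral view outer explode(inst.pid) pids as p;      -- one row per pid, or one row with p = null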
@@ -1,7 +1,7 @@
 -- Sprint 1 ----
 drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
-select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa
+select distinct p.id, coalesce(green_oa, 0) as green_oa
 from ${stats_db_name}.publication p
 left outer join (
 select p.id, 1 as green_oa
@@ -12,7 +12,7 @@ left outer join (

 drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
-select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit
+select distinct p.id, coalesce(grey_lit, 0) as grey_lit
 from ${stats_db_name}.publication p
 left outer join (
 select p.id, 1 as grey_lit
@@ -23,7 +23,7 @@ left outer join (

 drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
-select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
+select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
 from ${stats_db_name}.publication p
 left outer join (
 select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
@@ -33,7 +33,7 @@ left outer join (
 -- Sprint 2 ----
 drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
-select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
+select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
 from ${stats_db_name}.result r
 left outer join (
 select r.id, license.type as lic from ${stats_db_name}.result r
@@ -42,7 +42,7 @@ left outer join (

 drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
-select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
+select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
 from ${stats_db_name}.result r
 left outer join (
 select r.id, lower(parse_url(license.type, "HOST")) as lic_host
@@ -52,12 +52,12 @@ left outer join (

 drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
-select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
+select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
 from ${stats_db_name}.publication; /*EOS*/

 drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
-select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid
+select distinct r.id, coalesce(has_orcid, 0) as has_orcid
 from ${stats_db_name}.result r
 left outer join (
 select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
@@ -66,7 +66,7 @@ left outer join (
 ---- Sprint 3 ----
 drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
-select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref
+select distinct r.result as id, coalesce(fundref, 0) as fundref
 from ${stats_db_name}.project_results r
 left outer join (
 select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
@@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par
 SELECT ro.organization organization, ro.id, o.name
 from ${stats_db_name}.result_organization ro
 join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
-select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
+select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
 from tmp as o1
 join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
 group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
@@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store
 from ${stats_db_name}.result_organization ro
 join ${stats_db_name}.organization o on o.id=ro.organization
 where country <> 'UNKNOWN' and o.name is not null)
-select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
+select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
 from tmp as o1 join tmp as o2 on o1.id=o2.id
 where o1.id=o2.id and o1.country!=o2.country
 group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
@@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa
 select o.id organization, o.name, ro.project as project
 from ${stats_db_name}.organization o
 join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
-select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
+select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
 from tmp as o1
 join tmp as o2 on o1.project=o2.project
 where o1.organization<>o2.organization and o1.name<>o2.name
@@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor
 select o.id organization, o.name, o.country , ro.project as project
 from ${stats_db_name}.organization o
 join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
-select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
+select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
 from tmp as o1
 join tmp as o2 on o1.project=o2.project
 where o1.organization<>o2.organization and o1.country<>o2.country
@@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as
 join ${stats_db_name}.organization o on o.id=op.id
 join ${stats_db_name}.project p on p.id=op.project
 where country <> 'UNKNOWN')
-select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
+select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
 from tmp as f1
 join tmp as f2 on f1.project=f2.project
 where f1.country<>f2.country
@@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
 select distinct country, ro.id as result from ${stats_db_name}.organization o
 join ${stats_db_name}.result_organization ro on o.id=ro.organization
 where country <> 'UNKNOWN' and o.name is not null)
-select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations
+select o1.country country1, o2.country country2, count(o1.result) as collaborations
 from tmp as o1
 join tmp as o2 on o1.result=o2.result
 where o1.country<>o2.country
@@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
 ---- Sprint 4 ----
 drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
-select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
+select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
 from ${stats_db_name}.publication_datasources pd
 left outer join (
 select pd.id, 1 as in_diamond_journal
@@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a

 drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
-select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative
+select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
 from ${stats_db_name}.publication pd
 left outer join (
 select pd.id, 1 as is_transformative
@@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as

 drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
-select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
+select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
 from ${stats_db_name}.result_instance ri
 left outer join (
 select ri.id, 1 as pub_closed_other_open
@@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as
 ---- Sprint 5 ----
 drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
-select /*+ COALESCE(100) */ id, count(id) as number_of_copies
+select id, count(id) as number_of_copies
 from ${stats_db_name}.result_instance
 group by id; /*EOS*/

 ---- Sprint 6 ----
 drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
-SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads
+SELECT result_id, sum(downloads) no_downloads
 from openaire_prod_usage_stats.usage_stats
 join ${stats_db_name}.publication on result_id=id
 where downloads>0
@@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet

 drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
-SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads
+SELECT result_id, repository_id, sum(downloads) no_downloads
 from openaire_prod_usage_stats.usage_stats
 join ${stats_db_name}.publication on result_id=id
 where downloads>0
@@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored

 drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
-SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
+SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
 from openaire_prod_usage_stats.usage_stats us
 join ${stats_db_name}.publication on result_id=id where downloads>0
 GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/

 drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
-SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
+SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
 from openaire_prod_usage_stats.usage_stats us
 join ${stats_db_name}.publication on result_id=id
 where downloads>0
@@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
 UNION ALL
 select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo
 )
-SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
+SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
 FROM ${stats_db_name}.publication pd
 left outer join (
 select pd.id, 1 as is_gold
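The gold_oa lookup above uses LEFT SEMI JOIN, which behaves like an EXISTS filter: each left-side row is returned at most once when a match exists on the right, and only left-side columns may be selected. Two equivalent formulations, with names taken from the query above:

    -- semi-join form (as in indi_pub_gold_oa)
    select d.id, d.issn_online as issn
    from ${stats_db_name}.datasource d
    left semi join gold_oa on gold_oa.issn = d.issn_online;

    -- equivalent EXISTS form
    select d.id, d.issn_online as issn
    from ${stats_db_name}.datasource d
    where exists (select 1 from gold_oa where gold_oa.issn = d.issn_online);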
|
@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
|
||||||
FROM ${stats_db_name}.datasource
|
FROM ${stats_db_name}.datasource
|
||||||
WHERE issn_online IS NOT NULL ) as issn
|
WHERE issn_online IS NOT NULL ) as issn
|
||||||
WHERE LENGTH(issn) > 7)
|
WHERE LENGTH(issn) > 7)
|
||||||
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||||
FROM ${stats_db_name}.publication_datasources pd
|
FROM ${stats_db_name}.publication_datasources pd
|
||||||
LEFT OUTER JOIN (
|
LEFT OUTER JOIN (
|
||||||
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
|
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
|
||||||
|
@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
|
||||||
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
|
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
|
||||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid
|
select distinct p.id, coalesce(is_hybrid, 0) is_hybrid
|
||||||
from ${stats_db_name}.publication p
|
from ${stats_db_name}.publication p
|
||||||
left outer join (
|
left outer join (
|
||||||
select p.id, 1 as is_hybrid
|
select p.id, 1 as is_hybrid
|
||||||
|
@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet
|
||||||
where cast(year as int)>2003
|
where cast(year as int)>2003
|
||||||
group by ro.organization)
|
group by ro.organization)
|
||||||
--return results_fair/all_results
|
--return results_fair/all_results
|
||||||
select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||||
from allresults
|
from allresults
|
||||||
join result_fair on result_fair.organization=allresults.organization; /*EOS*/
|
join result_fair on result_fair.organization=allresults.organization; /*EOS*/
|
||||||
|
|
||||||
|
@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
|
||||||
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
|
||||||
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||||
from allresults ar
|
from allresults ar
|
||||||
join result_fair rf on rf.organization=ar.organization; /*EOS*/
|
join result_fair rf on rf.organization=ar.organization; /*EOS*/
|
||||||
|
|
||||||
|
@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct
|
||||||
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
|
||||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||||
from allresults
|
from allresults
|
||||||
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
|
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
|
||||||
|
|
||||||
|
@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as
|
||||||
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
|
||||||
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||||
from allresults ar join result_fair rf
|
from allresults ar join result_fair rf
|
||||||
on rf.organization=ar.organization; /*EOS*/
|
on rf.organization=ar.organization; /*EOS*/
|
||||||
|
|
||||||
|
@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as
|
||||||
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
|
||||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||||
from allresults
|
from allresults
|
||||||
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
|
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
|
||||||
|
|
||||||
|
@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as
|
||||||
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
|
||||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||||
from allresults
|
from allresults
|
||||||
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
|
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
|
||||||
|
|
||||||
|
@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
|
||||||
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
|
||||||
select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||||
from allresults
|
from allresults
|
||||||
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
|
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
|
||||||
|
|
||||||
|
@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof
|
||||||
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
|
||||||
select /*+ COALESCE(100) */ allpubsshare.organization,
|
select allpubsshare.organization,
|
||||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||||
+(case when d is null then 0 else 1 end))
|
+(case when d is null then 0 else 1 end))
|
||||||
org_openess FROM allpubsshare
|
org_openess FROM allpubsshare
|
||||||
|
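The indi_org_openess score computed above is the mean of the organization's open-access shares over the product types it actually has: the publication share p always participates, while the software share s and dataset share d are added to the numerator, and counted in the denominator, only when they are not null. A worked instance with illustrative values:

    -- p = 0.6, s = 0.2, d = null  ->  (0.6 + 0.2 + 0) / (1 + 1 + 0) = 0.4
    select (0.6 + 0.2 + 0) / (1 + 1 + 0) as org_openess;

A missing share therefore neither drags the score down nor pads it.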
@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all
|
||||||
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
|
||||||
select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization,
|
select cast(allpubsshare.year as int) year, allpubsshare.organization,
|
||||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||||
+(case when d is null then 0 else 1 end))
|
+(case when d is null then 0 else 1 end))
|
||||||
org_openess FROM allpubsshare
|
org_openess FROM allpubsshare
|
||||||
|
@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/
|
||||||
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
|
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
|
||||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint
|
select distinct p.id, coalesce(has_preprint, 0) as has_preprint
|
||||||
from ${stats_db_name}.publication_classifications p
|
from ${stats_db_name}.publication_classifications p
|
||||||
left outer join (
|
left outer join (
|
||||||
select p.id, 1 as has_preprint
|
select p.id, 1 as has_preprint
|
||||||
|
@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p
|
||||||
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
|
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
|
||||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription
|
select distinct p.id, coalesce(is_subscription, 0) as is_subscription
|
||||||
from ${stats_db_name}.publication p
|
from ${stats_db_name}.publication p
|
||||||
left outer join(
|
left outer join(
|
||||||
select p.id, 1 as is_subscription from ${stats_db_name}.publication p
|
select p.id, 1 as is_subscription from ${stats_db_name}.publication p
|
||||||
|
@ -640,7 +640,7 @@ from ${stats_db_name}.publication p
|
||||||
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
|
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
|
||||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||||
from ${stats_db_name}.result p
|
from ${stats_db_name}.result p
|
||||||
left outer join (
|
left outer join (
|
||||||
select p.id, 1 as result_with_pid
|
select p.id, 1 as result_with_pid
|
||||||
|
@ -654,7 +654,7 @@ group by rf.id; /*EOS*/
|
||||||
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
|
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
|
||||||
select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
select distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||||
as is_interdisciplinary
|
as is_interdisciplinary
|
||||||
from pub_fos_totals p
|
from pub_fos_totals p
|
||||||
left outer join (
|
left outer join (
|
||||||
|
@@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/
 drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/

 create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
-select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
+select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
 from ${stats_db_name}.publication p
 left outer join (
 select p.id, 1 as is_bronze_oa
@@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/
 drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/

 create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as
-select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id,
+select pry.project_id, pry.acronym, pry.result_id,
 coalesce(is_project_result_after, 0) as is_project_result_after
 from project_year_result_year pry
 left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after
@@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/
 drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/

 create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as
-select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
+select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
 from ${stats_db_name}.funder f
 left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder
 join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp
@@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu
 join ${stats_db_name}.project p on p.id=rp.project
 where cast(year as int)>2003
 group by p.funder)
-select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
+select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
 from allresults
 join result_fair on result_fair.funder=allresults.funder; /*EOS*/

@@ -745,7 +745,7 @@ allresults as
 join ${stats_db_name}.result r on r.id=rc.id
 where cast(year as int)>2003
 group by rc.ri_initiative)
-select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
+select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
 from allresults
 join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/

@@ -817,14 +817,16 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware
 drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/

 create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as
-select /*+ COALESCE(100) */ allpubsshare.funder,
+select allpubsshare.funder,
 (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
-+(case when d is null then 0 else 1 end)) funder_openess
-FROM allpubsshare
-left outer join (select funder,d from alldatasetssshare) tmp1
-on tmp1.funder=allpubsshare.funder
-left outer join (select funder,s from allsoftwaresshare) tmp2
-on tmp2.funder=allpubsshare.funder; /*EOS*/
++(case when d is null then 0 else 1 end))
+funder_openess FROM allpubsshare
+left outer join (select funder,d from
+alldatasetssshare) tmp1
+on tmp1.funder=allpubsshare.funder
+left outer join (select funder,s from
+allsoftwaresshare) tmp2
+on tmp2.funder=allpubsshare.funder; /*EOS*/

 DROP VIEW pubs_oa; /*EOS*/
 DROP VIEW datasets_oa; /*EOS*/
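Editor's note: the funder_openess score above averages the open-access shares of publications (p), software (s) and datasets (d), counting only the shares that actually exist for a funder. A self-contained toy query (values invented) shows the arithmetic:

with shares as (select 0.6 as p, cast(null as double) as s, 0.2 as d)
select (p + if(isnull(s), 0, s) + if(isnull(d), 0, d))
       / (1 + (case when s is null then 0 else 1 end)
            + (case when d is null then 0 else 1 end)) as funder_openess
from shares;
-- (0.6 + 0 + 0.2) / (1 + 0 + 1) = 0.4: a missing software share neither
-- contributes to the numerator nor inflates the denominator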
@@ -903,7 +905,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso
 drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/

 create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as
-select /*+ COALESCE(100) */ allpubsshare.ri_initiative,
+select allpubsshare.ri_initiative,
 (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
 +(case when d is null then 0 else 1 end))
 ris_openess FROM allpubsshare
@@ -941,7 +943,7 @@ with result_findable as
 join ${stats_db_name}.project p on p.id=rp.project
 where cast(year as int)>2003
 group by p.funder)
-select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
+select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
 from allresults
 join result_findable on result_findable.funder=allresults.funder; /*EOS*/

@@ -950,43 +952,41 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/

 create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as
 with result_contexts as
 (select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc
 join ${stats_db_name}.concept on concept.id=rc.concept
 join ${stats_db_name}.category on category.id=concept.category
 join ${stats_db_name}.context on context.id=category.context),
 result_findable as
 (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc
 join ${stats_db_name}.result r on r.id=rc.id
 join ${stats_db_name}.result_pids rp on rp.id=r.id
 where cast(r.year as int)>2003
 group by rc.ri_initiative),
 allresults as
 (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc
 join ${stats_db_name}.result r on r.id=rc.id
 where cast(r.year as int)>2003
 group by rc.ri_initiative)
-select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
+select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
 from allresults
 join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/

-drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/

 create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
 with org_names_pids as
 (select org.id,name, pid from ${stats_db_name}.organization org
 join ${stats_db_name}.organization_pids op on org.id=op.id),
 publicly_funded_orgs as
 (select distinct name from
 (select pf.name from stats_ext.insitutions_for_publicly_funded pf
 join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
 union all
 select pf.name from stats_ext.insitutions_for_publicly_funded pf
 join ${stats_db_name}.project p on p.funder=pf.name
 union all
 select op.name from stats_ext.insitutions_for_publicly_funded pf
 join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
 and pf.publicly_funded='yes') foo)
-select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
+select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
 from ${stats_db_name}.publication p
 left outer join (
 select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
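Editor's note: publicly_funded_orgs above unions three signals — government funders found in fundref, funders of known projects, and organizations matched by name or ROR pid and flagged publicly_funded. A self-contained toy version (all names invented) shows the dedup-union shape:

with gov_funders as (select 'Org A' as name),
     project_funders as (select 'Org B' as name),
     flagged_orgs as (select 'Org A' as name)
select distinct name from (
  select name from gov_funders
  union all
  select name from project_funders
  union all
  select name from flagged_orgs) foo;
-- -> 'Org A', 'Org B': union all keeps duplicates, the outer distinct collapses them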
@@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/

 drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/
 create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as
-select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license
+select distinct p.id, coalesce(green_with_license, 0) as green_with_license
 from ${stats_db_name}.publication p
 left outer join (
 select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p
@@ -1006,7 +1006,7 @@ left outer join (
 drop table if exists ${stats_db_name}.result_country purge; /*EOS*/

 create table ${stats_db_name}.result_country stored as parquet as
-select /*+ COALESCE(100) */ distinct id, country
+select distinct id, country
 from (
 select ro.id, o.country
 from ${stats_db_name}.result_organization ro
@@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/

 drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
 create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
-select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license
+select distinct r.id, coalesce(oa_with_license,0) as oa_with_license
 from ${stats_db_name}.result r
 left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r
 join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/
@@ -1029,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open
 drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/
 create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as
 with without_license as
 (select distinct id from ${stats_db_name}.indi_result_oa_with_license
 where oa_with_license=0)
-select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license
+select distinct r.id, coalesce(oa_without_license,0) as oa_without_license
 from ${stats_db_name}.result r
 left outer join (select distinct r.id, 1 as oa_without_license
 from ${stats_db_name}.result r
@@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /*
 create table ${stats_db_name}.indi_result_under_transformative stored as parquet as
 with transformative_dois as (
 select distinct doi from stats_ext.transformative_facts)
-select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative
+select distinct r.id, coalesce(under_transformative,0) as under_transformative
 from ${stats_db_name}.result r
 left outer join (
 select distinct rp.id, 1 as under_transformative
@@ -1,30 +1,30 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;

 ----------------------------------------------------
 -- Shortcuts for various definitions in stats db ---
 ----------------------------------------------------

 -- Peer reviewed:
-drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
+drop table if exists ${stats_db_name}.result_peerreviewed purge;

 create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
+select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
 from ${stats_db_name}.result r
 left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
-left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
+left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;

 -- Green OA:
-drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
+drop table if exists ${stats_db_name}.result_greenoa purge;

 create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green
+select r.id, case when green.green_oa=1 then true else false end as green
 from ${stats_db_name}.result r
-left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
+left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;

 -- GOLD OA:
-drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
+drop table if exists ${stats_db_name}.result_gold purge;

 create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
-select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold
+select r.id, case when gold.is_gold=1 then true else false end as gold
 from ${stats_db_name}.result r
-left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/
+left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
@@ -1,26 +1,58 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;

--- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
+-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
 -- peer reviewed)
-
-drop view if exists ${stats_db_name}.result; /*EOS*/
-drop table if exists ${stats_db_name}.result; /*EOS*/
-
-CREATE TABLE ${stats_db_name}.result stored as parquet as
-SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM (
-(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.publication)
-UNION ALL
-(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.dataset)
-UNION ALL
-(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.software)
-UNION ALL
-(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
-FROM ${stats_db_name}.otherresearchproduct)
-) r
+drop table if exists ${stats_db_name}.result_tmp;
+
+CREATE TABLE ${stats_db_name}.result_tmp (
+id STRING,
+title STRING,
+publisher STRING,
+journal STRING,
+`date` STRING,
+`year` INT,
+bestlicence STRING,
+access_mode STRING,
+embargo_end_date STRING,
+delayed BOOLEAN,
+authors INT,
+source STRING,
+abstract BOOLEAN,
+type STRING,
+peer_reviewed BOOLEAN,
+green BOOLEAN,
+gold BOOLEAN)
+clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
+
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.publication r
 LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
 LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.dataset r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.software r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+
+insert into ${stats_db_name}.result_tmp
+select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM ${stats_db_name}.otherresearchproduct r
+LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+
+drop table if exists ${stats_db_name}.result;
+drop view if exists ${stats_db_name}.result;
+create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
+drop table ${stats_db_name}.result_tmp;
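Editor's note: the added side stages rows in a transactional ORC table so that one INSERT per product type can append, then snapshots the staging table to parquet and drops it; the removed side built the parquet table in a single CTAS over a UNION ALL. A compact sketch of the stage-and-swap idiom (demo names hypothetical; assumes Hive ACID is enabled):

create table if not exists demo.t_tmp (id string)
clustered by (id) into 10 buckets
stored as orc tblproperties('transactional'='true'); -- ACID table: supports repeated INSERT INTO
insert into demo.t_tmp select 'a'; -- one insert per source table in the real script
insert into demo.t_tmp select 'b';
drop table if exists demo.t;
create table demo.t stored as parquet as select * from demo.t_tmp; -- final copy is plain parquet
drop table demo.t_tmp;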
@@ -1,4 +1,4 @@
-set mapred.job.queue.name=analytics; /*EOS*/
+set mapred.job.queue.name=analytics;

 --------------------------------------------------------------
 --------------------------------------------------------------
@@ -7,65 +7,65 @@ set mapred.job.queue.name=analytics; /*EOS*/
 --------------------------------------------------------------

 -- Publication temporary table
-DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
-
-CREATE TABLE ${stats_db_name}.publication stored as parquet as
-with pub_pr as (
-select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
-from ${openaire_db_name}.publication pub
-join ${openaire_db_name}.relation rel
-on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
-and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
-join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
-where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
-),
-pub_delayed as (
-select pub_id, max(delayed) as delayed
-from pub_pr
-group by pub_id
-)
-select /*+ COALESCE(100) */
-substr(pub.id, 4) as id,
-pub.title[0].value as title,
-pub.publisher.value as publisher,
-pub.journal.name as journal,
-pub.dateofacceptance.value as date,
-date_format(pub.dateofacceptance.value, 'yyyy') as year,
-pub.bestaccessright.classname as bestlicence,
-pub.embargoenddate.value as embargo_end_date,
-coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
-size(pub.author) as authors,
-concat_ws('\u003B', pub.source.value) as source,
-case when size(pub.description) > 0 then true else false end as abstract,
-'publication' as type
-from ${openaire_db_name}.publication pub
-left outer join pub_delayed on pub.id=pub_delayed.pub_id
-where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge;
+
+CREATE TABLE ${stats_db_name}.publication_tmp
+(
+id STRING,
+title STRING,
+publisher STRING,
+journal STRING,
+date STRING,
+year STRING,
+bestlicence STRING,
+embargo_end_date STRING,
+delayed BOOLEAN,
+authors INT,
+source STRING,
+abstract BOOLEAN,
+type STRING
+)
+clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
+
+INSERT INTO ${stats_db_name}.publication_tmp
+SELECT substr(p.id, 4) as id,
+p.title[0].value as title,
+p.publisher.value as publisher,
+p.journal.name as journal,
+p.dateofacceptance.value as date,
+date_format(p.dateofacceptance.value, 'yyyy') as year,
+p.bestaccessright.classname as bestlicence,
+p.embargoenddate.value as embargo_end_date,
+false as delayed,
+size(p.author) as authors,
+concat_ws('\u003B', p.source.value) as source,
+case when size(p.description) > 0 then true else false end as abstract,
+'publication' as type
+from ${openaire_db_name}.publication p
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge;

 CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type
+SELECT substr(p.id, 4) as id, instancetype.classname as type
 from ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge;

 CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
+SELECT substr(p.id, 4) as id, case
 when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
 when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
 when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 from ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.context) contexts as context
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge;

 CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
-SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
 FROM (
 SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
 from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
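Editor's note: the removed pub_pr/pub_delayed CTEs mark a publication as delayed when its acceptance date falls after the end date of at least one project that produced it; max(delayed) collapses the per-project booleans to true if any project had already ended (the added side simply hard-codes false as delayed). A toy run of the same aggregation (ids and dates invented):

with pub_pr as (
select 'pub1' as pub_id, to_date('2020-06-01') > to_date('2019-12-31') as delayed
union all
select 'pub1' as pub_id, to_date('2020-06-01') > to_date('2021-01-01') as delayed
)
select pub_id, max(delayed) as delayed from pub_pr group by pub_id;
-- -> ('pub1', true): one already-ended project is enough to flag the publication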
@@ -73,44 +73,44 @@ FROM (
 LEFT OUTER JOIN (
 SELECT substr(d.id, 4) id
 from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
+WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge;

 CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
-select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language
+select substr(p.id, 4) as id, p.language.classname as language
 FROM ${openaire_db_name}.publication p
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge;

 CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
+SELECT substr(p.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.originalid) oids AS ids
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge;

 CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
+SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
 FROM ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.pid) pids AS ppid
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge;

 CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
-select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
+select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
 FROM ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.subject) subjects AS subject
-where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

-DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge;

 CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
-SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
+SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
 FROM ${openaire_db_name}.publication p
 lateral view explode(p.extrainfo) citations AS citation
 WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
-and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
+and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
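Editor's note: the *_oids/_pids/_topics tables above all rely on LATERAL VIEW explode to flatten array columns into one row per element, and publication_citations uses Hive's xpath_string UDF to dig the cited identifier out of an XML blob. Two standalone snippets (literal values invented) illustrate both:

-- LATERAL VIEW explode turns each array element into its own row:
select id, oid from (select 'p1' as id, array('doi:1', 'hdl:2') as originalid) t
lateral view explode(t.originalid) oids as oid;
-- -> ('p1', 'doi:1'), ('p1', 'hdl:2')

-- xpath_string pulls a single value out of an XML snippet:
select xpath_string('<citation><id type="openaire" value="oai::1234"/></citation>',
                    "//citation/id[@type='openaire']/@value");
-- -> 'oai::1234'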
@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 create view if not exists TARGET.category as select * from SOURCE.category;
 create view if not exists TARGET.concept as select * from SOURCE.concept;
 create view if not exists TARGET.context as select * from SOURCE.context;
@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics; /*EOS*/
-
 drop database if exists TARGET cascade;
 create database if not exists TARGET;

@@ -83,17 +81,11 @@ create table TARGET.result stored as parquet as
 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
-'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
-'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
-'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
-'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
-'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
-'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
-'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
+'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
 ) )) foo;

 create view if not exists TARGET.category as select * from SOURCE.category;
@@ -264,6 +256,7 @@ create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * f

 create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);

 create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
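Editor's note: each monitor table above is the SOURCE table restricted to results that survived into TARGET.result, expressed as a correlated EXISTS. The same filter can be written as a left semi join, which Hive and Spark typically plan identically (database names here are demo placeholders):

-- Equivalent semi-join formulation of the subsetting filter used above:
create table TARGET_demo.result_instance stored as parquet as
select orig.* from SOURCE_demo.result_instance orig
left semi join TARGET_demo.result r on r.id = orig.id;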
@@ -1,5 +1,3 @@
-set mapred.job.queue.name=analytics;
-
 drop database if exists TARGET cascade;
 create database if not exists TARGET;

Some files were not shown because too many files have changed in this diff.