Compare commits: mergeutils ... main
477 Commits
@@ -63,11 +63,13 @@
	<dependencies>
		<dependency>
			<groupId>eu.dnetlib.dhp</groupId>
			<artifactId>dhp-pace-core</artifactId>
			<version>${project.version}</version>
			<groupId>edu.cmu</groupId>
			<artifactId>secondstring</artifactId>
		</dependency>
		<dependency>
			<groupId>com.ibm.icu</groupId>
			<artifactId>icu4j</artifactId>
		</dependency>

		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>

@ -1,53 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import okhttp3.MediaType;
|
||||
import okhttp3.RequestBody;
|
||||
import okhttp3.internal.Util;
|
||||
import okio.BufferedSink;
|
||||
import okio.Okio;
|
||||
import okio.Source;
|
||||
|
||||
public class InputStreamRequestBody extends RequestBody {
|
||||
|
||||
private final InputStream inputStream;
|
||||
private final MediaType mediaType;
|
||||
private final long lenght;
|
||||
|
||||
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {
|
||||
|
||||
return new InputStreamRequestBody(inputStream, mediaType, len);
|
||||
}
|
||||
|
||||
private InputStreamRequestBody(InputStream inputStream, MediaType mediaType, long len) {
|
||||
this.inputStream = inputStream;
|
||||
this.mediaType = mediaType;
|
||||
this.lenght = len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MediaType contentType() {
|
||||
return mediaType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long contentLength() {
|
||||
|
||||
return lenght;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(BufferedSink sink) throws IOException {
|
||||
Source source = null;
|
||||
try {
|
||||
source = Okio.source(inputStream);
|
||||
sink.writeAll(source);
|
||||
} finally {
|
||||
Util.closeQuietly(source);
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,8 +0,0 @@

package eu.dnetlib.dhp.common.api;

public class MissingConceptDoiException extends Throwable {
	public MissingConceptDoiException(String message) {
		super(message);
	}
}

@ -1,365 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.IOException;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.entity.ContentType;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
|
||||
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
|
||||
import okhttp3.*;
|
||||
|
||||
public class ZenodoAPIClient implements Serializable {
|
||||
|
||||
String urlString;
|
||||
String bucket;
|
||||
|
||||
String deposition_id;
|
||||
String access_token;
|
||||
|
||||
public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8");
|
||||
|
||||
private static final MediaType MEDIA_TYPE_ZIP = MediaType.parse("application/zip");
|
||||
|
||||
public String getUrlString() {
|
||||
return urlString;
|
||||
}
|
||||
|
||||
public void setUrlString(String urlString) {
|
||||
this.urlString = urlString;
|
||||
}
|
||||
|
||||
public String getBucket() {
|
||||
return bucket;
|
||||
}
|
||||
|
||||
public void setBucket(String bucket) {
|
||||
this.bucket = bucket;
|
||||
}
|
||||
|
||||
public void setDeposition_id(String deposition_id) {
|
||||
this.deposition_id = deposition_id;
|
||||
}
|
||||
|
||||
public ZenodoAPIClient(String urlString, String access_token) {
|
||||
|
||||
this.urlString = urlString;
|
||||
this.access_token = access_token;
|
||||
}
|
||||
|
||||
/**
|
||||
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
|
||||
*
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
*/
|
||||
public int newDeposition() throws IOException {
|
||||
String json = "{}";
|
||||
|
||||
URL url = new URL(urlString);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setRequestMethod("POST");
|
||||
conn.setDoOutput(true);
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = json.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
}
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
conn.disconnect();
|
||||
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
|
||||
this.bucket = newSubmission.getLinks().getBucket();
|
||||
this.deposition_id = newSubmission.getId();
|
||||
|
||||
return responseCode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload files in Zenodo.
|
||||
*
|
||||
* @param is the inputStream for the file to upload
|
||||
* @param file_name the name of the file as it will appear on Zenodo
|
||||
* @return the response code
|
||||
*/
|
||||
public int uploadIS(InputStream is, String file_name) throws IOException {
|
||||
|
||||
URL url = new URL(bucket + "/" + file_name);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("PUT");
|
||||
|
||||
byte[] buf = new byte[8192];
|
||||
int length;
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
while ((length = is.read(buf)) != -1) {
|
||||
os.write(buf, 0, length);
|
||||
}
|
||||
|
||||
}
|
||||
int responseCode = conn.getResponseCode();
|
||||
if (!checkOKStatus(responseCode)) {
|
||||
throw new IOException("Unexpected code " + responseCode + getBody(conn));
|
||||
}
|
||||
|
||||
return responseCode;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private String getBody(HttpURLConnection conn) throws IOException {
|
||||
String body = "{}";
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
|
||||
StringBuilder response = new StringBuilder();
|
||||
String responseLine = null;
|
||||
while ((responseLine = br.readLine()) != null) {
|
||||
response.append(responseLine.trim());
|
||||
}
|
||||
|
||||
body = response.toString();
|
||||
|
||||
}
|
||||
return body;
|
||||
}
|
||||
|
||||
/**
|
||||
* Associates metadata information to the current deposition
|
||||
*
|
||||
* @param metadata the metadata
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
*/
|
||||
public int sendMretadata(String metadata) throws IOException {
|
||||
|
||||
URL url = new URL(urlString + "/" + deposition_id);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("PUT");
|
||||
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = metadata.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
|
||||
}
|
||||
|
||||
final int responseCode = conn.getResponseCode();
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + getBody(conn));
|
||||
|
||||
return responseCode;
|
||||
|
||||
}
|
||||
|
||||
private boolean checkOKStatus(int responseCode) {
|
||||
|
||||
if (HttpURLConnection.HTTP_OK != responseCode ||
|
||||
HttpURLConnection.HTTP_CREATED != responseCode)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* To publish the current deposition. It works for both new deposition or new version of an old deposition
|
||||
*
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
*/
|
||||
@Deprecated
|
||||
public int publish() throws IOException {
|
||||
|
||||
String json = "{}";
|
||||
|
||||
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
|
||||
|
||||
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
|
||||
|
||||
Request request = new Request.Builder()
|
||||
.url(urlString + "/" + deposition_id + "/actions/publish")
|
||||
.addHeader("Authorization", "Bearer " + access_token)
|
||||
.post(body)
|
||||
.build();
|
||||
|
||||
try (Response response = httpClient.newCall(request).execute()) {
|
||||
|
||||
if (!response.isSuccessful())
|
||||
throw new IOException("Unexpected code " + response + response.body().string());
|
||||
|
||||
return response.code();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
|
||||
* for the new version.
|
||||
*
|
||||
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
|
||||
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
||||
* concept_rec_id = 656930
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
* @throws MissingConceptDoiException
|
||||
*/
|
||||
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
||||
setDepositionId(concept_rec_id, 1);
|
||||
String json = "{}";
|
||||
|
||||
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("POST");
|
||||
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = json.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
|
||||
}
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||
String latest_draft = zenodoModel.getLinks().getLatest_draft();
|
||||
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
|
||||
bucket = getBucket(latest_draft);
|
||||
|
||||
return responseCode;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* To finish uploading a version or new deposition not published
|
||||
* It sets the deposition_id and the bucket to be used
|
||||
*
|
||||
*
|
||||
* @param deposition_id the deposition id of the not yet published upload
|
||||
* concept_rec_id = 656930
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
* @throws MissingConceptDoiException
|
||||
*/
|
||||
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
|
||||
|
||||
this.deposition_id = deposition_id;
|
||||
|
||||
String json = "{}";
|
||||
|
||||
URL url = new URL(urlString + "/" + deposition_id);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setRequestMethod("POST");
|
||||
conn.setDoOutput(true);
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
byte[] input = json.getBytes("utf-8");
|
||||
os.write(input, 0, input.length);
|
||||
}
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
conn.disconnect();
|
||||
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||
bucket = zenodoModel.getLinks().getBucket();
|
||||
|
||||
return responseCode;
|
||||
|
||||
}
|
||||
|
||||
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
|
||||
|
||||
ZenodoModelList zenodoModelList = new Gson()
|
||||
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
|
||||
|
||||
for (ZenodoModel zm : zenodoModelList) {
|
||||
if (zm.getConceptrecid().equals(concept_rec_id)) {
|
||||
deposition_id = zm.getId();
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (zenodoModelList.size() == 0)
|
||||
throw new MissingConceptDoiException(
|
||||
"The concept record id specified was missing in the list of depositions");
|
||||
setDepositionId(concept_rec_id, page + 1);
|
||||
|
||||
}
|
||||
|
||||
private String getPrevDepositions(String page) throws IOException {
|
||||
|
||||
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
|
||||
urlBuilder.addQueryParameter("page", page);
|
||||
|
||||
URL url = new URL(urlBuilder.build().toString());
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("GET");
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
return body;
|
||||
|
||||
}
|
||||
|
||||
private String getBucket(String inputUurl) throws IOException {
|
||||
|
||||
URL url = new URL(inputUurl);
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
|
||||
conn.setDoOutput(true);
|
||||
conn.setRequestMethod("GET");
|
||||
|
||||
String body = getBody(conn);
|
||||
|
||||
int responseCode = conn.getResponseCode();
|
||||
|
||||
conn.disconnect();
|
||||
if (!checkOKStatus(responseCode))
|
||||
throw new IOException("Unexpected code " + responseCode + body);
|
||||
|
||||
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
|
||||
|
||||
return zenodoModel.getLinks().getBucket();
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,14 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
public class Community {
|
||||
private String identifier;
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
}
|
|
@ -1,47 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
public class Creator {
|
||||
private String affiliation;
|
||||
private String name;
|
||||
private String orcid;
|
||||
|
||||
public String getAffiliation() {
|
||||
return affiliation;
|
||||
}
|
||||
|
||||
public void setAffiliation(String affiliation) {
|
||||
this.affiliation = affiliation;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getOrcid() {
|
||||
return orcid;
|
||||
}
|
||||
|
||||
public void setOrcid(String orcid) {
|
||||
this.orcid = orcid;
|
||||
}
|
||||
|
||||
public static Creator newInstance(String name, String affiliation, String orcid) {
|
||||
Creator c = new Creator();
|
||||
if (name != null) {
|
||||
c.name = name;
|
||||
}
|
||||
if (affiliation != null) {
|
||||
c.affiliation = affiliation;
|
||||
}
|
||||
if (orcid != null) {
|
||||
c.orcid = orcid;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
}
|
|
@ -1,44 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class File implements Serializable {
|
||||
private String checksum;
|
||||
private String filename;
|
||||
private long filesize;
|
||||
private String id;
|
||||
|
||||
public String getChecksum() {
|
||||
return checksum;
|
||||
}
|
||||
|
||||
public void setChecksum(String checksum) {
|
||||
this.checksum = checksum;
|
||||
}
|
||||
|
||||
public String getFilename() {
|
||||
return filename;
|
||||
}
|
||||
|
||||
public void setFilename(String filename) {
|
||||
this.filename = filename;
|
||||
}
|
||||
|
||||
public long getFilesize() {
|
||||
return filesize;
|
||||
}
|
||||
|
||||
public void setFilesize(long filesize) {
|
||||
this.filesize = filesize;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,23 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Grant implements Serializable {
|
||||
private String id;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public static Grant newInstance(String id) {
|
||||
Grant g = new Grant();
|
||||
g.id = id;
|
||||
|
||||
return g;
|
||||
}
|
||||
}
|
|
@ -1,92 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Links implements Serializable {
|
||||
|
||||
private String bucket;
|
||||
|
||||
private String discard;
|
||||
|
||||
private String edit;
|
||||
private String files;
|
||||
private String html;
|
||||
private String latest_draft;
|
||||
private String latest_draft_html;
|
||||
private String publish;
|
||||
|
||||
private String self;
|
||||
|
||||
public String getBucket() {
|
||||
return bucket;
|
||||
}
|
||||
|
||||
public void setBucket(String bucket) {
|
||||
this.bucket = bucket;
|
||||
}
|
||||
|
||||
public String getDiscard() {
|
||||
return discard;
|
||||
}
|
||||
|
||||
public void setDiscard(String discard) {
|
||||
this.discard = discard;
|
||||
}
|
||||
|
||||
public String getEdit() {
|
||||
return edit;
|
||||
}
|
||||
|
||||
public void setEdit(String edit) {
|
||||
this.edit = edit;
|
||||
}
|
||||
|
||||
public String getFiles() {
|
||||
return files;
|
||||
}
|
||||
|
||||
public void setFiles(String files) {
|
||||
this.files = files;
|
||||
}
|
||||
|
||||
public String getHtml() {
|
||||
return html;
|
||||
}
|
||||
|
||||
public void setHtml(String html) {
|
||||
this.html = html;
|
||||
}
|
||||
|
||||
public String getLatest_draft() {
|
||||
return latest_draft;
|
||||
}
|
||||
|
||||
public void setLatest_draft(String latest_draft) {
|
||||
this.latest_draft = latest_draft;
|
||||
}
|
||||
|
||||
public String getLatest_draft_html() {
|
||||
return latest_draft_html;
|
||||
}
|
||||
|
||||
public void setLatest_draft_html(String latest_draft_html) {
|
||||
this.latest_draft_html = latest_draft_html;
|
||||
}
|
||||
|
||||
public String getPublish() {
|
||||
return publish;
|
||||
}
|
||||
|
||||
public void setPublish(String publish) {
|
||||
this.publish = publish;
|
||||
}
|
||||
|
||||
public String getSelf() {
|
||||
return self;
|
||||
}
|
||||
|
||||
public void setSelf(String self) {
|
||||
this.self = self;
|
||||
}
|
||||
}
|
|
@ -1,153 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Metadata implements Serializable {
|
||||
|
||||
private String access_right;
|
||||
private List<Community> communities;
|
||||
private List<Creator> creators;
|
||||
private String description;
|
||||
private String doi;
|
||||
private List<Grant> grants;
|
||||
private List<String> keywords;
|
||||
private String language;
|
||||
private String license;
|
||||
private PrereserveDoi prereserve_doi;
|
||||
private String publication_date;
|
||||
private List<String> references;
|
||||
private List<RelatedIdentifier> related_identifiers;
|
||||
private String title;
|
||||
private String upload_type;
|
||||
private String version;
|
||||
|
||||
public String getUpload_type() {
|
||||
return upload_type;
|
||||
}
|
||||
|
||||
public void setUpload_type(String upload_type) {
|
||||
this.upload_type = upload_type;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public void setVersion(String version) {
|
||||
this.version = version;
|
||||
}
|
||||
|
||||
public String getAccess_right() {
|
||||
return access_right;
|
||||
}
|
||||
|
||||
public void setAccess_right(String access_right) {
|
||||
this.access_right = access_right;
|
||||
}
|
||||
|
||||
public List<Community> getCommunities() {
|
||||
return communities;
|
||||
}
|
||||
|
||||
public void setCommunities(List<Community> communities) {
|
||||
this.communities = communities;
|
||||
}
|
||||
|
||||
public List<Creator> getCreators() {
|
||||
return creators;
|
||||
}
|
||||
|
||||
public void setCreators(List<Creator> creators) {
|
||||
this.creators = creators;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public List<Grant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public void setGrants(List<Grant> grants) {
|
||||
this.grants = grants;
|
||||
}
|
||||
|
||||
public List<String> getKeywords() {
|
||||
return keywords;
|
||||
}
|
||||
|
||||
public void setKeywords(List<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(String license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
public PrereserveDoi getPrereserve_doi() {
|
||||
return prereserve_doi;
|
||||
}
|
||||
|
||||
public void setPrereserve_doi(PrereserveDoi prereserve_doi) {
|
||||
this.prereserve_doi = prereserve_doi;
|
||||
}
|
||||
|
||||
public String getPublication_date() {
|
||||
return publication_date;
|
||||
}
|
||||
|
||||
public void setPublication_date(String publication_date) {
|
||||
this.publication_date = publication_date;
|
||||
}
|
||||
|
||||
public List<String> getReferences() {
|
||||
return references;
|
||||
}
|
||||
|
||||
public void setReferences(List<String> references) {
|
||||
this.references = references;
|
||||
}
|
||||
|
||||
public List<RelatedIdentifier> getRelated_identifiers() {
|
||||
return related_identifiers;
|
||||
}
|
||||
|
||||
public void setRelated_identifiers(List<RelatedIdentifier> related_identifiers) {
|
||||
this.related_identifiers = related_identifiers;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class PrereserveDoi implements Serializable {
|
||||
private String doi;
|
||||
private String recid;
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getRecid() {
|
||||
return recid;
|
||||
}
|
||||
|
||||
public void setRecid(String recid) {
|
||||
this.recid = recid;
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class RelatedIdentifier implements Serializable {
|
||||
private String identifier;
|
||||
private String relation;
|
||||
private String resource_type;
|
||||
private String scheme;
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
public String getRelation() {
|
||||
return relation;
|
||||
}
|
||||
|
||||
public void setRelation(String relation) {
|
||||
this.relation = relation;
|
||||
}
|
||||
|
||||
public String getResource_type() {
|
||||
return resource_type;
|
||||
}
|
||||
|
||||
public void setResource_type(String resource_type) {
|
||||
this.resource_type = resource_type;
|
||||
}
|
||||
|
||||
public String getScheme() {
|
||||
return scheme;
|
||||
}
|
||||
|
||||
public void setScheme(String scheme) {
|
||||
this.scheme = scheme;
|
||||
}
|
||||
}
|
|
@ -1,118 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class ZenodoModel implements Serializable {
|
||||
|
||||
private String conceptrecid;
|
||||
private String created;
|
||||
|
||||
private List<File> files;
|
||||
private String id;
|
||||
private Links links;
|
||||
private Metadata metadata;
|
||||
private String modified;
|
||||
private String owner;
|
||||
private String record_id;
|
||||
private String state;
|
||||
private boolean submitted;
|
||||
private String title;
|
||||
|
||||
public String getConceptrecid() {
|
||||
return conceptrecid;
|
||||
}
|
||||
|
||||
public void setConceptrecid(String conceptrecid) {
|
||||
this.conceptrecid = conceptrecid;
|
||||
}
|
||||
|
||||
public String getCreated() {
|
||||
return created;
|
||||
}
|
||||
|
||||
public void setCreated(String created) {
|
||||
this.created = created;
|
||||
}
|
||||
|
||||
public List<File> getFiles() {
|
||||
return files;
|
||||
}
|
||||
|
||||
public void setFiles(List<File> files) {
|
||||
this.files = files;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public Links getLinks() {
|
||||
return links;
|
||||
}
|
||||
|
||||
public void setLinks(Links links) {
|
||||
this.links = links;
|
||||
}
|
||||
|
||||
public Metadata getMetadata() {
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public void setMetadata(Metadata metadata) {
|
||||
this.metadata = metadata;
|
||||
}
|
||||
|
||||
public String getModified() {
|
||||
return modified;
|
||||
}
|
||||
|
||||
public void setModified(String modified) {
|
||||
this.modified = modified;
|
||||
}
|
||||
|
||||
public String getOwner() {
|
||||
return owner;
|
||||
}
|
||||
|
||||
public void setOwner(String owner) {
|
||||
this.owner = owner;
|
||||
}
|
||||
|
||||
public String getRecord_id() {
|
||||
return record_id;
|
||||
}
|
||||
|
||||
public void setRecord_id(String record_id) {
|
||||
this.record_id = record_id;
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public void setState(String state) {
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public boolean isSubmitted() {
|
||||
return submitted;
|
||||
}
|
||||
|
||||
public void setSubmitted(boolean submitted) {
|
||||
this.submitted = submitted;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
}
|
|
@@ -1,7 +0,0 @@

package eu.dnetlib.dhp.common.api.zenodo;

import java.util.ArrayList;

public class ZenodoModelList extends ArrayList<ZenodoModel> {
}

@@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
import com.wcohen.ss.JaroWinkler;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.model.Person;
import scala.Tuple2;

@@ -146,10 +147,20 @@ public class AuthorMerger {
	}

	public static String pidToComparableString(StructuredProperty pid) {
		final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
			: "";
		return (pid.getQualifier() != null ? classid : "")
			+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
		final String classId = Optional
			.ofNullable(pid)
			.map(
				p -> Optional
					.ofNullable(p.getQualifier())
					.map(Qualifier::getClassid)
					.map(String::toLowerCase)
					.orElse(""))
			.orElse("");
		return Optional
			.ofNullable(pid)
			.map(StructuredProperty::getValue)
			.map(v -> String.join("|", v, classId))
			.orElse("");
	}

	public static int countAuthorsPids(List<Author> authors) {

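For reference, a rough sketch of how the reworked `pidToComparableString` is expected to behave (illustrative only; it assumes the usual bean-style setters on `StructuredProperty` and `Qualifier` from dhp-schemas):

```java
// Illustrative fragment, not part of the change set.
StructuredProperty pid = new StructuredProperty();
Qualifier orcid = new Qualifier();
orcid.setClassid("ORCID"); // the class id is lower-cased by the new implementation
pid.setQualifier(orcid);
pid.setValue("0000-0002-1825-0097");

// value and class id are joined with "|", so this should yield "0000-0002-1825-0097|orcid"
String key = AuthorMerger.pidToComparableString(pid);

// a null pid, or a pid without a value, now maps to "" instead of risking the
// NullPointerException the old ternary-based version hit on a missing qualifier
String empty = AuthorMerger.pidToComparableString(null);
```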
@@ -14,7 +14,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -135,10 +135,10 @@ public class GroupEntitiesSparkJob {
					.applyCoarVocabularies(entity, vocs),
				OAFENTITY_KRYO_ENC)
			.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
			.reduceGroups((ReduceFunction<OafEntity>) MergeUtils::checkedMerge)
			.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
			.map(
				(MapFunction<Tuple2<String, OafEntity>, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
					t._2().getClass().getName(), t._2()),
				(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
					t.getClass().getName(), t),
				Encoders.tuple(Encoders.STRING(), OAFENTITY_KRYO_ENC));

			// pivot on "_1" (classname of the entity)

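The switch from `reduceGroups` to `mapGroups` hands the whole group of entities sharing an id to `MergeUtils.mergeById`, which sorts the group before folding it. A rough, simplified sketch of that per-group logic, based on the `mergeGroup` implementation shown further down in this diff (the `records` collection is hypothetical input):

```java
// Sketch only: what mergeById is expected to do with one group of records
// that share the same OpenAIRE identifier.
List<OafEntity> group = new ArrayList<>(records);

// highest-priority record first: pid authority, then trust, then result type, then id
group.sort(MergeEntitiesComparator.INSTANCE.reversed());

OafEntity merged = group.get(0);
for (int i = 1; i < group.size(); i++) {
	// the boolean flag enables the delegated-authority check (see the MergeUtils changes below)
	merged = MergeUtils.checkedMerge(merged, group.get(i), true);
}
```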
@@ -1,6 +1,8 @@

package eu.dnetlib.dhp.schema.oaf.utils;

import org.apache.commons.lang3.StringUtils;

public class DoiCleaningRule {

	public static String clean(final String doi) {

@@ -11,4 +13,26 @@ public class DoiCleaningRule {
			.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
	}

	public static String normalizeDoi(final String input) {
		if (input == null)
			return null;
		final String replaced = input
			.replaceAll("\\n|\\r|\\t|\\s", "")
			.toLowerCase()
			.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
		if (StringUtils.isEmpty(replaced))
			return null;

		if (!replaced.contains("10."))
			return null;

		final String ret = replaced.substring(replaced.indexOf("10."));

		if (!ret.startsWith(CleaningFunctions.DOI_PREFIX))
			return null;

		return ret;

	}

}

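A few hedged expectations for the new `normalizeDoi` helper; the exact prefix rewriting depends on `CleaningFunctions.DOI_PREFIX_REGEX` and `DOI_PREFIX`, which are not part of this diff:

```java
// Illustrative fragment, not part of the change set; results marked "expected".

// whitespace is stripped and the value lower-cased before the prefix rewrite
DoiCleaningRule.normalizeDoi(" 10.1000/ABC 123 ");           // expected: "10.1000/abc123"

// null input, or input with no "10." anywhere, is rejected
DoiCleaningRule.normalizeDoi(null);                           // expected: null
DoiCleaningRule.normalizeDoi("not-a-doi");                    // expected: null

// everything before the first "10." is dropped, so a DOI wrapped in a resolver URL
// should come back as the bare identifier
DoiCleaningRule.normalizeDoi("https://doi.org/10.1000/xyz"); // expected: "10.1000/xyz"
```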
@@ -92,6 +92,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
		INVALID_AUTHOR_NAMES.add("null anonymous");
		INVALID_AUTHOR_NAMES.add("unbekannt");
		INVALID_AUTHOR_NAMES.add("unknown");
		INVALID_AUTHOR_NAMES.add("autor, Sin");
		INVALID_AUTHOR_NAMES.add("Desconocido / Inconnu,");

		INVALID_URL_HOSTS.add("creativecommons.org");
		INVALID_URL_HOSTS.add("www.academia.edu");

@@ -117,7 +119,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
						.getContext()
						.stream()
						.filter(c -> !StringUtils.startsWith(c.getId().toLowerCase(), contextId))
						.collect(Collectors.toList()));
						.collect(Collectors.toCollection(ArrayList::new)));
			}
			return (T) res;
		} else {

@ -1001,4 +1003,41 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.orElse(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements bad and ugly things that we should get rid of ASAP.
|
||||
*
|
||||
* @param value
|
||||
* @return
|
||||
* @param <T>
|
||||
*/
|
||||
public static <T extends Oaf> T dedicatedUglyHacks(T value) {
|
||||
if (value instanceof OafEntity) {
|
||||
if (value instanceof Result) {
|
||||
final Result r = (Result) value;
|
||||
|
||||
// Fix for AMS Acta
|
||||
Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.filter(
|
||||
i -> Optional
|
||||
.ofNullable(i.getHostedby())
|
||||
.map(KeyValue::getKey)
|
||||
.map(dsId -> dsId.equals("10|re3data_____::4cc76bed7ce2fb95fd8e7a2dfde16016"))
|
||||
.orElse(false)))
|
||||
.ifPresent(instance -> instance.forEach(i -> {
|
||||
if (Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(pid -> pid.stream().noneMatch(p -> p.getValue().startsWith("10.6092/unibo/amsacta")))
|
||||
.orElse(false)) {
|
||||
i.setHostedby(UNKNOWN_REPOSITORY);
|
||||
}
|
||||
}));
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,106 @@

package eu.dnetlib.dhp.schema.oaf.utils;

import java.util.*;

import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;

public class MergeEntitiesComparator implements Comparator<Oaf> {
	static final List<String> PID_AUTHORITIES = Arrays
		.asList(
			ModelConstants.ARXIV_ID,
			ModelConstants.PUBMED_CENTRAL_ID,
			ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
			ModelConstants.DATACITE_ID,
			ModelConstants.CROSSREF_ID);

	static final List<String> RESULT_TYPES = Arrays
		.asList(
			ModelConstants.ORP_RESULTTYPE_CLASSID,
			ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
			ModelConstants.DATASET_RESULTTYPE_CLASSID,
			ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);

	public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();

	@Override
	public int compare(Oaf left, Oaf right) {
		if (left == null && right == null)
			return 0;
		if (left == null)
			return -1;
		if (right == null)
			return 1;

		int res = 0;

		// pid authority
		int cfp1 = Optional
			.ofNullable(left.getCollectedfrom())
			.map(
				cf -> cf
					.stream()
					.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
					.max(Integer::compare)
					.orElse(-1))
			.orElse(-1);
		int cfp2 = Optional
			.ofNullable(right.getCollectedfrom())
			.map(
				cf -> cf
					.stream()
					.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
					.max(Integer::compare)
					.orElse(-1))
			.orElse(-1);

		if (cfp1 >= 0 && cfp1 > cfp2) {
			return 1;
		} else if (cfp2 >= 0 && cfp2 > cfp1) {
			return -1;
		}

		// trust
		if (left.getDataInfo() != null && right.getDataInfo() != null) {
			res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
		}

		// result type
		if (res == 0) {
			if (left instanceof Result && right instanceof Result) {
				Result r1 = (Result) left;
				Result r2 = (Result) right;

				if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
					if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
						return -1;
					}
				} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
					return 1;
				}

				int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
				int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());

				if (rt1 >= 0 && rt1 > rt2) {
					return 1;
				} else if (rt2 >= 0 && rt2 > rt1) {
					return -1;
				}
			}
		}

		// id
		if (res == 0) {
			if (left instanceof OafEntity && right instanceof OafEntity) {
				res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
			}
		}

		return res;
	}

}

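A quick illustration of the pid-authority ordering implemented above (hypothetical records; `KeyValue` is assumed to be the plain bean from dhp-schemas, and the repository id is made up):

```java
// Illustrative fragment, not part of the change set.
KeyValue crossref = new KeyValue();
crossref.setKey(ModelConstants.CROSSREF_ID);

KeyValue repository = new KeyValue();
repository.setKey("10|opendoar____::hypothetical-repository-id"); // made-up id

Result fromCrossref = new Result();
fromCrossref.setCollectedfrom(Arrays.asList(crossref));

Result fromRepository = new Result();
fromRepository.setCollectedfrom(Arrays.asList(repository));

// Crossref is last (= highest priority) in PID_AUTHORITIES and the repository id is not
// listed, so the Crossref record compares as greater; sorting with INSTANCE.reversed(),
// as mergeGroup does, therefore puts it first.
int cmp = MergeEntitiesComparator.INSTANCE.compare(fromCrossref, fromRepository); // expected > 0
```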
@ -5,7 +5,11 @@ import static com.google.common.base.Preconditions.checkArgument;
|
|||
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.format.DateTimeParseException;
|
||||
import java.util.*;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
|
@ -19,22 +23,51 @@ import org.apache.commons.lang3.tuple.Pair;
|
|||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class MergeUtils {
|
||||
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
||||
return mergeGroup(s, oafEntityIterator, true);
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T checkedMerge(final T left, final T right) {
|
||||
return (T) merge(left, right, false);
|
||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
||||
return mergeGroup(s, oafEntityIterator, false);
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
|
||||
boolean checkDelegateAuthority) {
|
||||
|
||||
ArrayList<T> sortedEntities = new ArrayList<>();
|
||||
oafEntityIterator.forEachRemaining(sortedEntities::add);
|
||||
sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());
|
||||
|
||||
Iterator<T> it = sortedEntities.iterator();
|
||||
T merged = it.next();
|
||||
|
||||
while (it.hasNext()) {
|
||||
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
||||
}
|
||||
|
||||
return merged;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
|
||||
return (T) merge(left, right, checkDelegateAuthority);
|
||||
}
|
||||
|
||||
public static <T extends Result, E extends Result> Result mergeResult(final T left, final E right) {
|
||||
return (Result) merge(left, right, false);
|
||||
}
|
||||
|
||||
public static Oaf merge(final Oaf left, final Oaf right) {
|
||||
return merge(left, right, false);
|
||||
}
|
||||
|
||||
public static Oaf merge(final Oaf left, final Oaf right, boolean checkDelegatedAuthority) {
|
||||
static Oaf merge(final Oaf left, final Oaf right, boolean checkDelegatedAuthority) {
|
||||
if (sameClass(left, right, OafEntity.class)) {
|
||||
return mergeEntities(left, right, checkDelegatedAuthority);
|
||||
} else if (sameClass(left, right, Relation.class)) {
|
||||
|
@ -72,7 +105,7 @@ public class MergeUtils {
|
|||
return mergeSoftware((Software) left, (Software) right);
|
||||
}
|
||||
|
||||
return mergeResult((Result) left, (Result) right);
|
||||
return mergeResultFields((Result) left, (Result) right);
|
||||
} else if (sameClass(left, right, Datasource.class)) {
|
||||
// TODO
|
||||
final int trust = compareTrust(left, right);
|
||||
|
@ -95,7 +128,7 @@ public class MergeUtils {
|
|||
* https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
|
||||
* such version.
|
||||
* <p>
|
||||
* Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
|
||||
* Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
|
||||
* and proceeds with the canonical property merging.
|
||||
*
|
||||
* @param left
|
||||
|
@ -113,11 +146,12 @@ public class MergeUtils {
|
|||
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
|
||||
return right;
|
||||
}
|
||||
|
||||
// TODO: raise trust to have preferred fields from one or the other??
|
||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||
return mergeResult(left, right);
|
||||
if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
|
||||
return mergeResultFields(left, right);
|
||||
} else {
|
||||
return mergeResult(right, left);
|
||||
return mergeResultFields(right, left);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -177,9 +211,9 @@ public class MergeUtils {
|
|||
|
||||
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
|
||||
Function<T, K> keyExtractor, BinaryOperator<T> merger) {
|
||||
if (left == null) {
|
||||
return right;
|
||||
} else if (right == null) {
|
||||
if (left == null || left.isEmpty()) {
|
||||
return right != null ? right : new ArrayList<>();
|
||||
} else if (right == null || right.isEmpty()) {
|
||||
return left;
|
||||
}
|
||||
|
||||
|
@ -190,7 +224,7 @@ public class MergeUtils {
|
|||
.concat(h.stream(), l.stream())
|
||||
.filter(Objects::nonNull)
|
||||
.distinct()
|
||||
.collect(Collectors.toMap(keyExtractor, v -> v, merger))
|
||||
.collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new))
|
||||
.values());
|
||||
}
|
||||
|
||||
|
@ -226,7 +260,13 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
// TODO review
|
||||
private static List<KeyValue> mergeKeyValue(List<KeyValue> left, List<KeyValue> right, int trust) {
|
||||
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
|
||||
if (left == null) {
|
||||
return right;
|
||||
} else if (right == null) {
|
||||
return left;
|
||||
}
|
||||
|
||||
if (trust < 0) {
|
||||
List<KeyValue> s = left;
|
||||
left = right;
|
||||
|
@ -234,8 +274,9 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
HashMap<String, KeyValue> values = new HashMap<>();
|
||||
left.forEach(kv -> values.put(kv.getKey(), kv));
|
||||
right.forEach(kv -> values.putIfAbsent(kv.getKey(), kv));
|
||||
|
||||
Optional.ofNullable(left).ifPresent(l -> l.forEach(kv -> values.put(kv.getKey(), kv)));
|
||||
Optional.ofNullable(right).ifPresent(r -> r.forEach(kv -> values.putIfAbsent(kv.getKey(), kv)));
|
||||
|
||||
return new ArrayList<>(values.values());
|
||||
}
|
||||
|
@ -268,8 +309,7 @@ public class MergeUtils {
|
|||
*/
|
||||
private static <T extends Oaf> T mergeOafFields(T merged, T enrich, int trust) {
|
||||
|
||||
// TODO: union of all values, but what does it mean with KeyValue pairs???
|
||||
merged.setCollectedfrom(mergeKeyValue(merged.getCollectedfrom(), enrich.getCollectedfrom(), trust));
|
||||
merged.setCollectedfrom(mergeByKey(merged.getCollectedfrom(), enrich.getCollectedfrom(), trust));
|
||||
merged.setDataInfo(chooseDataInfo(merged.getDataInfo(), enrich.getDataInfo(), trust));
|
||||
merged.setLastupdatetimestamp(max(merged.getLastupdatetimestamp(), enrich.getLastupdatetimestamp()));
|
||||
|
||||
|
@ -289,16 +329,13 @@ public class MergeUtils {
|
|||
|
||||
merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
|
||||
merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
|
||||
// dateofcollection mettere today quando si fa merge
|
||||
merged.setDateofcollection(chooseString(merged.getDateofcollection(), enrich.getDateofcollection(), trust));
|
||||
// setDateoftransformation mettere vuota in dedup, nota per Claudio
|
||||
merged.setDateofcollection(LocalDateTime.now().toString());
|
||||
merged
|
||||
.setDateoftransformation(
|
||||
chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust));
|
||||
// TODO: was missing in OafEntity.merge
|
||||
merged.setExtraInfo(unionDistinctLists(merged.getExtraInfo(), enrich.getExtraInfo(), trust));
|
||||
// oaiprovenanze da mettere a null quando si genera merge
|
||||
merged.setOaiprovenance(chooseReference(merged.getOaiprovenance(), enrich.getOaiprovenance(), trust));
|
||||
// When merging records OAI provenance becomes null
|
||||
merged.setOaiprovenance(null);
|
||||
merged.setMeasures(unionDistinctLists(merged.getMeasures(), enrich.getMeasures(), trust));
|
||||
|
||||
return merged;
|
||||
|
@ -330,12 +367,12 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
// TODO keyvalue merge
|
||||
merge.setProperties(mergeKeyValue(merge.getProperties(), enrich.getProperties(), trust));
|
||||
merge.setProperties(mergeByKey(merge.getProperties(), enrich.getProperties(), trust));
|
||||
|
||||
return merge;
|
||||
}
|
||||
|
||||
public static <T extends Result> T mergeResult(T original, T enrich) {
|
||||
private static <T extends Result> T mergeResultFields(T original, T enrich) {
|
||||
final int trust = compareTrust(original, enrich);
|
||||
T merge = mergeOafEntityFields(original, enrich, trust);
|
||||
|
||||
|
@ -345,73 +382,73 @@ public class MergeUtils {
|
|||
merge.setProcessingchargecurrency(enrich.getProcessingchargecurrency());
|
||||
}
|
||||
|
||||
// author = usare la stessa logica che in dedup
|
||||
merge.setAuthor(chooseReference(merge.getAuthor(), enrich.getAuthor(), trust));
|
||||
// il primo che mi arriva secondo l'ordinamento per priorita'
|
||||
merge.setResulttype(chooseReference(merge.getResulttype(), enrich.getResulttype(), trust));
|
||||
// gestito come il resulttype perche' e' un subtype
|
||||
merge.setMetaResourceType(chooseReference(merge.getMetaResourceType(), enrich.getMetaResourceType(), trust));
|
||||
// spostiamo nell'instance e qui prendo il primo che arriva
|
||||
merge.setLanguage(chooseReference(merge.getLanguage(), enrich.getLanguage(), trust));
|
||||
// country lasicamo,o cosi' -> parentesi sul datainfo
|
||||
merge.setCountry(unionDistinctLists(merge.getCountry(), enrich.getCountry(), trust));
|
||||
// ok
|
||||
merge.setSubject(unionDistinctLists(merge.getSubject(), enrich.getSubject(), trust));
|
||||
// union per priority quindi vanno in append
|
||||
merge.setTitle(unionTitle(merge.getTitle(), enrich.getTitle(), trust));
|
||||
// ok
|
||||
merge.setRelevantdate(unionDistinctLists(merge.getRelevantdate(), enrich.getRelevantdate(), trust));
|
||||
// prima trust e poi longest list
|
||||
merge.setDescription(longestLists(merge.getDescription(), enrich.getDescription()));
|
||||
// trust piu' alto e poi piu' vecchia
|
||||
merge.setDateofacceptance(chooseReference(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust));
|
||||
// ok, ma publisher va messo ripetibile
|
||||
merge.setPublisher(chooseReference(merge.getPublisher(), enrich.getPublisher(), trust));
|
||||
// ok
|
||||
merge.setEmbargoenddate(chooseReference(merge.getEmbargoenddate(), enrich.getEmbargoenddate(), trust));
|
||||
// ok
|
||||
merge.setAuthor(mergeAuthors(merge.getAuthor(), enrich.getAuthor(), trust));
|
||||
|
||||
// keep merge value if present
|
||||
if (merge.getResulttype() == null) {
|
||||
merge.setResulttype(enrich.getResulttype());
|
||||
merge.setMetaResourceType(enrich.getMetaResourceType());
|
||||
}
|
||||
|
||||
// should be an instance attribute, get the first non-null value
|
||||
merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));
|
||||
|
||||
// distinct countries, do not manage datainfo
|
||||
merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
|
||||
|
||||
// distinct subjects
|
||||
merge.setSubject(mergeStructuredProperties(merge.getSubject(), enrich.getSubject(), trust));
|
||||
|
||||
// distinct titles
|
||||
merge.setTitle(mergeStructuredProperties(merge.getTitle(), enrich.getTitle(), trust));
|
||||
|
||||
merge.setRelevantdate(mergeStructuredProperties(merge.getRelevantdate(), enrich.getRelevantdate(), trust));
|
||||
|
||||
if (merge.getDescription() == null || merge.getDescription().isEmpty() || trust == 0) {
|
||||
merge.setDescription(longestLists(merge.getDescription(), enrich.getDescription()));
|
||||
}
|
||||
|
||||
merge
|
||||
.setDateofacceptance(
|
||||
mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust));
|
||||
|
||||
merge.setPublisher(coalesce(merge.getPublisher(), enrich.getPublisher()));
|
||||
merge.setEmbargoenddate(coalesce(merge.getEmbargoenddate(), enrich.getEmbargoenddate()));
|
||||
merge.setSource(unionDistinctLists(merge.getSource(), enrich.getSource(), trust));
|
||||
// ok
|
||||
merge.setFulltext(unionDistinctLists(merge.getFulltext(), enrich.getFulltext(), trust));
|
||||
// ok
|
||||
merge.setFormat(unionDistinctLists(merge.getFormat(), enrich.getFormat(), trust));
|
||||
// ok
|
||||
merge.setContributor(unionDistinctLists(merge.getContributor(), enrich.getContributor(), trust));
|
||||
|
||||
// first take the higher trust; on that, take the best value among the instances TODO
|
||||
// higher trust, but with equal trust take the most specific value (based on the vocabulary)
|
||||
// see notes
|
||||
// cannot use com.google.common.base.Objects.firstNonNull as it throws NPE when both terms are null
|
||||
merge.setResourcetype(firstNonNull(merge.getResourcetype(), enrich.getResourcetype()));
|
||||
// this field might contain the original type from the raw metadata, no strategy yet to merge it
|
||||
merge.setResourcetype(coalesce(merge.getResourcetype(), enrich.getResourcetype()));
|
||||
|
||||
// ok
|
||||
merge.setCoverage(unionDistinctLists(merge.getCoverage(), enrich.getCoverage(), trust));
|
||||
|
||||
// most open ok
|
||||
if (enrich.getBestaccessright() != null
|
||||
&& new AccessRightComparator<>()
|
||||
.compare(enrich.getBestaccessright(), merge.getBestaccessright()) < 0) {
|
||||
merge.setBestaccessright(enrich.getBestaccessright());
|
||||
}
|
||||
|
||||
// TODO merge of datainfo given same id
|
||||
merge.setContext(unionDistinctLists(merge.getContext(), enrich.getContext(), trust));
|
||||
// merge datainfo for same context id
|
||||
merge.setContext(mergeLists(merge.getContext(), enrich.getContext(), trust, Context::getId, (r, l) -> {
|
||||
ArrayList<DataInfo> di = new ArrayList<>();
|
||||
di.addAll(r.getDataInfo());
|
||||
di.addAll(l.getDataInfo());
|
||||
r.setDataInfo(di);
|
||||
return r;
|
||||
}));
|
||||
|
||||
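// A hedged illustration of the lambda above, with a made-up context id: if both lists carry a
// Context with id "dariah", the merged list is assumed to keep a single "dariah" entry whose
// dataInfo is the concatenation of the two original DataInfo lists.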
// ok
|
||||
merge
|
||||
.setExternalReference(
|
||||
unionDistinctLists(merge.getExternalReference(), enrich.getExternalReference(), trust));
|
||||
mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust));
|
||||
|
||||
// instance enrichment or union
|
||||
// review instance equals => add pid to comparison
|
||||
if (!isAnEnrichment(merge) && !isAnEnrichment(enrich))
|
||||
merge
|
||||
.setInstance(
|
||||
mergeLists(
|
||||
merge.getInstance(), enrich.getInstance(), trust,
|
||||
MergeUtils::instanceKeyExtractor,
|
||||
MergeUtils::instanceMerger));
|
||||
else {
|
||||
if (!isAnEnrichment(merge) && !isAnEnrichment(enrich)) {
|
||||
merge.setInstance(mergeInstances(merge.getInstance(), enrich.getInstance(), trust));
|
||||
} else {
|
||||
final List<Instance> enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance()
|
||||
: enrich.getInstance();
|
||||
final List<Instance> enrichedInstances = isAnEnrichment(merge) ? enrich.getInstance()
|
||||
|
@ -421,17 +458,135 @@ public class MergeUtils {
|
|||
merge.setInstance(enrichInstances(enrichedInstances, enrichmentInstances));
|
||||
}
|
||||
|
||||
merge.setEoscifguidelines(unionDistinctLists(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust));
|
||||
merge
|
||||
.setEoscifguidelines(
|
||||
mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust));
|
||||
merge.setIsGreen(booleanOR(merge.getIsGreen(), enrich.getIsGreen()));
|
||||
// OK but should be list of values
|
||||
merge.setOpenAccessColor(chooseReference(merge.getOpenAccessColor(), enrich.getOpenAccessColor(), trust));
|
||||
merge.setOpenAccessColor(coalesce(merge.getOpenAccessColor(), enrich.getOpenAccessColor()));
|
||||
merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal()));
|
||||
merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded()));
|
||||
|
||||
return merge;
|
||||
}
|
||||
|
||||
private static Field<String> mergeDateOfAcceptance(Field<String> merge, Field<String> enrich, int trust) {
|
||||
// higher trust, then oldest date
|
||||
if ((merge == null || trust == 0) && enrich != null) {
|
||||
if (merge == null) {
|
||||
return enrich;
|
||||
} else {
|
||||
try {
|
||||
LocalDate merge_date = LocalDate.parse(merge.getValue(), DateTimeFormatter.ISO_DATE);
|
||||
try {
|
||||
LocalDate enrich_date = LocalDate.parse(enrich.getValue(), DateTimeFormatter.ISO_DATE);
|
||||
|
||||
if (enrich_date.getYear() > 1300
|
||||
&& (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) {
|
||||
return enrich;
|
||||
}
|
||||
} catch (NullPointerException | DateTimeParseException e) {
|
||||
return merge;
|
||||
}
|
||||
} catch (NullPointerException | DateTimeParseException e) {
|
||||
return enrich;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep value
|
||||
return merge;
|
||||
}
|
||||
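A minimal sketch of the policy implemented above, with illustrative values only (the Field instances and dates are made up):
// illustrative values; mirrors the branches of mergeDateOfAcceptance
Field<String> current = new Field<>();
current.setValue("2021-06-15");
Field<String> incoming = new Field<>();
incoming.setValue("2019-03-01");
// trust == 0  -> the older well-formed date ("2019-03-01") is expected to win
// trust != 0  -> the current value ("2021-06-15") is kept
// an unparsable or pre-1300 incoming date is expected to leave the current value untouched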
|
||||
private static List<Instance> mergeInstances(List<Instance> v1, List<Instance> v2, int trust) {
|
||||
return mergeLists(
|
||||
v1, v2, trust,
|
||||
MergeUtils::instanceKeyExtractor,
|
||||
MergeUtils::instanceMerger);
|
||||
}
|
||||
|
||||
private static List<EoscIfGuidelines> mergeEosciifguidelines(List<EoscIfGuidelines> v1, List<EoscIfGuidelines> v2,
|
||||
int trust) {
|
||||
return mergeLists(
|
||||
v1, v2, trust, er -> Joiner
|
||||
.on("||")
|
||||
.useForNull("")
|
||||
.join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()),
|
||||
(r, l) -> r);
|
||||
|
||||
}
|
||||
|
||||
private static List<ExternalReference> mergeExternalReference(List<ExternalReference> v1,
|
||||
List<ExternalReference> v2, int trust) {
|
||||
return mergeLists(
|
||||
v1, v2, trust, er -> Joiner
|
||||
.on(',')
|
||||
.useForNull("")
|
||||
.join(
|
||||
er.getSitename(), er.getLabel(),
|
||||
er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(),
|
||||
er.getQuery(), toString(er.getDataInfo())),
|
||||
(r, l) -> r);
|
||||
}
|
||||
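The dedup keys above rely on Guava's Joiner; a small sketch of the assumed behaviour, with made-up field values:
// made-up values; nulls are replaced by empty strings, keeping the key positional
String key = Joiner.on(',').useForNull("").join("siteA", null, "http://example.org/ref", "qualifier", "id-1");
// key -> "siteA,,http://example.org/ref,qualifier,id-1"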
|
||||
private static String toString(DataInfo di) {
|
||||
return Joiner
|
||||
.on(',')
|
||||
.useForNull("")
|
||||
.join(
|
||||
di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(),
|
||||
di.getInferenceprovenance(), toString(di.getProvenanceaction()));
|
||||
}
|
||||
|
||||
private static String toString(Qualifier q) {
|
||||
return Joiner
|
||||
.on(',')
|
||||
.useForNull("")
|
||||
.join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename());
|
||||
}
|
||||
|
||||
private static String toString(StructuredProperty sp) {
|
||||
return Joiner
|
||||
.on(',')
|
||||
.useForNull("")
|
||||
.join(toString(sp.getQualifier()), sp.getValue());
|
||||
}
|
||||
|
||||
private static <T extends StructuredProperty> List<T> mergeStructuredProperties(List<T> v1, List<T> v2, int trust) {
|
||||
return mergeLists(v1, v2, trust, MergeUtils::toString, (r, l) -> r);
|
||||
}
|
||||
|
||||
private static <T extends Qualifier> List<T> mergeQualifiers(List<T> v1, List<T> v2, int trust) {
|
||||
return mergeLists(v1, v2, trust, MergeUtils::toString, (r, l) -> r);
|
||||
}
|
||||
|
||||
private static <T> T coalesce(T m, T e) {
|
||||
return m != null ? m : e;
|
||||
}
|
||||
|
||||
private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
|
||||
if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
|
||||
return e;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
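// A short sketch of the two null-handling helpers, with illustrative inputs:
// coalesce("current", "enrichment")  -> "current"   (the current value is preferred when non-null)
// coalesce(null, "enrichment")       -> "enrichment"
// coalesceQualifier additionally treats a null or blank classid as missing, so a qualifier
// whose classid is "" is expected to be replaced by the enrichment qualifier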
|
||||
private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
|
||||
List<List<Author>> authors = new ArrayList<>();
|
||||
if (author != null) {
|
||||
authors.add(author);
|
||||
}
|
||||
if (author1 != null) {
|
||||
authors.add(author1);
|
||||
}
|
||||
return AuthorMerger.merge(authors);
|
||||
}
|
||||
|
||||
private static String instanceKeyExtractor(Instance i) {
|
||||
// three levels of concatenation:
|
||||
// 1. ::
|
||||
// 2. @@
|
||||
// 3. ||
|
||||
return String
|
||||
.join(
|
||||
"::",
|
||||
|
@ -439,10 +594,10 @@ public class MergeUtils {
|
|||
kvKeyExtractor(i.getCollectedfrom()),
|
||||
qualifierKeyExtractor(i.getAccessright()),
|
||||
qualifierKeyExtractor(i.getInstancetype()),
|
||||
Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
|
||||
Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
|
||||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
|
||||
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
|
||||
.orElse(null));
|
||||
}
|
||||
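// An illustrative shape of the composed key, using made-up values for the three separator levels
// listed above: "::" joins the top-level parts, "@@" joins repeated urls/pids, and "||" joins
// qualifier and value inside each structured-property key
// hostedbyKey::collectedfromKey::OPEN::Article::http://u1@@http://u2::doi||10.1234/abc@@handle||1234/567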
|
||||
|
@ -472,9 +627,9 @@ public class MergeUtils {
|
|||
MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1));
|
||||
i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext()));
|
||||
i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance()));
|
||||
i.setLicense(firstNonNull(i1.getLicense(), i2.getLicense()));
|
||||
i.setProcessingchargeamount(firstNonNull(i1.getProcessingchargeamount(), i2.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(firstNonNull(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency()));
|
||||
i.setLicense(coalesce(i1.getLicense(), i2.getLicense()));
|
||||
i.setProcessingchargeamount(coalesce(i1.getProcessingchargeamount(), i2.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(coalesce(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency()));
|
||||
i
|
||||
.setMeasures(
|
||||
mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1));
|
||||
|
@ -497,9 +652,14 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
|
||||
if (d1 == null || StringUtils.isBlank(d1.getValue())) {
|
||||
return d2;
|
||||
} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
|
||||
return d1;
|
||||
}
|
||||
|
||||
return Stream
|
||||
.of(d1, d2)
|
||||
.filter(Objects::nonNull)
|
||||
.min(
|
||||
Comparator
|
||||
.comparing(
|
||||
|
@ -546,13 +706,13 @@ public class MergeUtils {
|
|||
private static String spKeyExtractor(StructuredProperty sp) {
|
||||
return Optional
|
||||
.ofNullable(sp)
|
||||
.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
|
||||
.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
|
||||
int trust = compareTrust(original, enrich);
|
||||
final T merge = mergeResult(original, enrich);
|
||||
final T merge = mergeResultFields(original, enrich);
|
||||
|
||||
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
|
||||
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
|
||||
|
@ -563,7 +723,7 @@ public class MergeUtils {
|
|||
|
||||
private static <T extends Software> T mergeSoftware(T original, T enrich) {
|
||||
int trust = compareTrust(original, enrich);
|
||||
final T merge = mergeResult(original, enrich);
|
||||
final T merge = mergeResultFields(original, enrich);
|
||||
|
||||
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
|
||||
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
|
||||
|
@ -577,7 +737,7 @@ public class MergeUtils {
|
|||
|
||||
private static <T extends Dataset> T mergeDataset(T original, T enrich) {
|
||||
int trust = compareTrust(original, enrich);
|
||||
T merge = mergeResult(original, enrich);
|
||||
T merge = mergeResultFields(original, enrich);
|
||||
|
||||
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
|
||||
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
|
||||
|
@ -596,7 +756,7 @@ public class MergeUtils {
|
|||
|
||||
public static <T extends Publication> T mergePublication(T original, T enrich) {
|
||||
final int trust = compareTrust(original, enrich);
|
||||
T merged = mergeResult(original, enrich);
|
||||
T merged = mergeResultFields(original, enrich);
|
||||
|
||||
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
|
||||
|
||||
|
@ -693,7 +853,7 @@ public class MergeUtils {
|
|||
* @param b the b
|
||||
* @return the list
|
||||
*/
|
||||
public static List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
|
||||
private static List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
|
||||
if (a == null || b == null)
|
||||
return a == null ? b : a;
|
||||
|
||||
|
@ -714,9 +874,11 @@ public class MergeUtils {
|
|||
if (toEnrichInstances == null) {
|
||||
return enrichmentResult;
|
||||
}
|
||||
if (enrichmentInstances == null) {
|
||||
return enrichmentResult;
|
||||
|
||||
if (enrichmentInstances == null || enrichmentInstances.isEmpty()) {
|
||||
return toEnrichInstances;
|
||||
}
|
||||
|
||||
Map<String, Instance> ri = toInstanceMap(enrichmentInstances);
|
||||
|
||||
toEnrichInstances.forEach(i -> {
|
||||
|
@ -844,8 +1006,8 @@ public class MergeUtils {
|
|||
* single attribute only if it is missing in the current instance
|
||||
* The only repeatable field enriched is measures
|
||||
*
|
||||
* @param merge the current instance
|
||||
* @param enrichment the enrichment instance
|
||||
* @param merge the current instance
|
||||
* @param enrichment the enrichment instance
|
||||
*/
|
||||
private static void applyEnrichment(final Instance merge, final Instance enrichment) {
|
||||
if (merge == null || enrichment == null)
|
||||
|
|
|
@ -9,10 +9,18 @@ public class OrganizationPidComparator implements Comparator<StructuredProperty>
|
|||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
if (left == null) {
|
||||
return right == null ? 0 : -1;
|
||||
} else if (right == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(PidType.openorgs))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.openorgs))
|
||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
/**
|
||||
* Comparator for sorting the values from the dnet:review_levels vocabulary, implements the following ordering
|
||||
|
@ -15,10 +14,18 @@ public class RefereedComparator implements Comparator<Qualifier> {
|
|||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
if (left == null || left.getClassid() == null) {
|
||||
return (right == null || right.getClassid() == null) ? 0 : -1;
|
||||
} else if (right == null || right.getClassid() == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if ("0001".equals(lClass))
|
||||
return -1;
|
||||
if ("0001".equals(rClass))
|
||||
|
|
|
@ -13,6 +13,9 @@ public class ResultPidComparator implements Comparator<StructuredProperty> {
|
|||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(PidType.doi))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.doi))
|
||||
|
|
|
@ -1,77 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class ResultTypeComparator implements Comparator<Result> {
|
||||
|
||||
@Override
|
||||
public int compare(Result left, Result right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
HashSet<String> lCf = getCollectedFromIds(left);
|
||||
HashSet<String> rCf = getCollectedFromIds(right);
|
||||
|
||||
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
|
||||
return -1;
|
||||
}
|
||||
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
String lClass = left.getResulttype().getClassid();
|
||||
String rClass = right.getResulttype().getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
protected HashSet<String> getCollectedFromIds(Result left) {
|
||||
return Optional
|
||||
.ofNullable(left.getCollectedfrom())
|
||||
.map(
|
||||
cf -> cf
|
||||
.stream()
|
||||
.map(KeyValue::getKey)
|
||||
.collect(Collectors.toCollection(HashSet::new)))
|
||||
.orElse(new HashSet<>());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public class PaceCommonUtils {
|
||||
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
public static String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
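A hedged usage sketch; the exact output is intentionally not asserted, since it depends on the transliteration and alias tables defined above:
// prints the normalized form of a string mixing diacritics, punctuation and digits
System.out.println(PaceCommonUtils.normalize("Études on e-Science: Überdruck (2021)!"));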
|
||||
public static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public static String unicodeNormalization(final String s) {
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
}
|
|
@ -12,7 +12,7 @@ import com.google.common.collect.Iterables;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.common.PaceCommonUtils;
|
||||
import eu.dnetlib.pace.util.Capitalise;
|
||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||
|
||||
|
@ -86,7 +86,7 @@ public class Person {
|
|||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
|
@ -15,4 +15,4 @@ public class Capitalise implements Function<String, String> {
|
|||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
};
|
||||
}
|
|
@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
|
|||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
};
|
||||
}
|
|
@ -1,5 +1,8 @@
|
|||
package eu.dnetlib.dhp.application
|
||||
|
||||
import eu.dnetlib.dhp.common.Constants
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/** This is the main Interface SparkApplication
|
||||
|
@ -70,4 +73,13 @@ abstract class AbstractScalaApplication(
|
|||
.getOrCreate()
|
||||
}
|
||||
|
||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||
val total_items = spark.read.text(targetPath).count()
|
||||
writeHdfsFile(
|
||||
spark.sparkContext.hadoopConfiguration,
|
||||
s"$total_items",
|
||||
outputBasePath + Constants.MDSTORE_SIZE_PATH
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,109 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.api;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Disabled
|
||||
class ZenodoAPIClientTest {
|
||||
|
||||
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
|
||||
private final String ACCESS_TOKEN = "";
|
||||
|
||||
private final String CONCEPT_REC_ID = "657113";
|
||||
|
||||
private final String depositionId = "674915";
|
||||
|
||||
@Test
|
||||
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
|
||||
|
||||
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
|
||||
|
||||
Assertions.assertEquals(200, client.sendMretadata(metadata));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNewDeposition() throws IOException {
|
||||
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
Assertions.assertEquals(201, client.newDeposition());
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
|
||||
|
||||
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
|
||||
|
||||
Assertions.assertEquals(200, client.sendMretadata(metadata));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
|
||||
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
|
||||
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/newVersion")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
|
||||
|
||||
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
|
||||
ACCESS_TOKEN);
|
||||
|
||||
Assertions.assertEquals(201, client.newVersion(CONCEPT_REC_ID));
|
||||
|
||||
File file = new File(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/common/api/newVersion2")
|
||||
.getPath());
|
||||
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
|
||||
|
||||
Assertions.assertEquals(202, client.publish());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -63,7 +63,7 @@ public class MergeUtilsTest {
|
|||
assertEquals(1, d1.getCollectedfrom().size());
|
||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
|
||||
final Result p1d2 = MergeUtils.checkedMerge(p1, d2);
|
||||
final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true);
|
||||
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
|
||||
assertTrue(p1d2 instanceof Publication);
|
||||
assertEquals(p1.getId(), p1d2.getId());
|
||||
|
@ -74,7 +74,7 @@ public class MergeUtilsTest {
|
|||
Publication p2 = read("publication_2.json", Publication.class);
|
||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||
|
||||
final Result p2d1 = MergeUtils.checkedMerge(p2, d1);
|
||||
final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true);
|
||||
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
|
||||
assertTrue(p2d1 instanceof Dataset);
|
||||
assertEquals(d1.getId(), p2d1.getId());
|
||||
|
@ -86,7 +86,7 @@ public class MergeUtilsTest {
|
|||
Publication p1 = read("publication_1.json", Publication.class);
|
||||
Publication p2 = read("publication_2.json", Publication.class);
|
||||
|
||||
Result p1p2 = MergeUtils.checkedMerge(p1, p2);
|
||||
Result p1p2 = MergeUtils.checkedMerge(p1, p2, true);
|
||||
assertTrue(p1p2 instanceof Publication);
|
||||
assertEquals(p1.getId(), p1p2.getId());
|
||||
assertEquals(2, p1p2.getCollectedfrom().size());
|
||||
|
|
|
@ -49,6 +49,12 @@
|
|||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
|
|
|
@ -20,7 +20,7 @@ public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
|||
return suffixPrefixChain(s, param("mod"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
static Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
|
||||
// create the list of words from the string (remove short words)
|
||||
List<String> wordsList = Arrays
|
||||
|
@ -38,7 +38,7 @@ public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
|||
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
static private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
|
||||
Set<String> set = Sets.newLinkedHashSet();
|
||||
switch (wordsList.size()) {
|
||||
|
@ -80,12 +80,16 @@ public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
|||
|
||||
}
|
||||
|
||||
private String suffix(String s, int len) {
|
||||
private static String suffix(String s, int len) {
|
||||
return s.substring(s.length() - len);
|
||||
}
|
||||
|
||||
private String prefix(String s, int len) {
|
||||
private static String prefix(String s, int len) {
|
||||
return s.substring(0, len);
|
||||
}
|
||||
|
||||
static public void main(String[] args) {
|
||||
String title = "MY LIFE AS A BOSON: THE STORY OF \"THE HIGGS\"".toLowerCase();
|
||||
System.out.println(suffixPrefixChain(title, 10));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.pace.common;
|
|||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
@ -14,19 +13,15 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public class AbstractPaceFunctions {
|
||||
public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||
|
||||
// city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||
|
@ -41,9 +36,6 @@ public class AbstractPaceFunctions {
|
|||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
// blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
|
@ -51,8 +43,6 @@ public class AbstractPaceFunctions {
|
|||
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
// doi prefix for normalization
|
||||
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||
|
@ -129,25 +119,6 @@ public class AbstractPaceFunctions {
|
|||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
protected static String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
|
@ -162,23 +133,6 @@ public class AbstractPaceFunctions {
|
|||
return s != null;
|
||||
}
|
||||
|
||||
public static String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public static String utf8(final String s) {
|
||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
|
@ -233,22 +187,6 @@ public class AbstractPaceFunctions {
|
|||
return newset;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
@ -303,10 +241,6 @@ public class AbstractPaceFunctions {
|
|||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
public static String normalizePid(String pid) {
|
||||
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
@ -11,6 +13,7 @@ import eu.dnetlib.pace.config.Config;
|
|||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.AuthorMatchers;
|
||||
|
||||
@ComparatorClass("authorsMatch")
|
||||
public class AuthorsMatch extends AbstractListComparator {
|
||||
|
@ -41,24 +44,36 @@ public class AuthorsMatch extends AbstractListComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
public double compare(final List<String> left, final List<String> right, final Config conf) {
|
||||
if (left.isEmpty() || right.isEmpty())
|
||||
return -1;
|
||||
|
||||
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
||||
if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD)
|
||||
return 1.0;
|
||||
|
||||
int maxMiss = Integer.MAX_VALUE;
|
||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
|
||||
Double threshold = getDoubleParam("threshold");
|
||||
int maxMiss = Integer.MAX_VALUE;
|
||||
|
||||
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) {
|
||||
maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size()));
|
||||
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) {
|
||||
maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size()));
|
||||
}
|
||||
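// Worked example with illustrative numbers: for threshold = 0.95 and 20 authors on each side,
// maxMiss = floor((1 - 0.95) * 20) = 1, i.e. at most one unmatched author is tolerated before
// the comparison short-circuits to 0.0 further below.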
|
||||
int common = 0;
|
||||
|
||||
List<String> a = new ArrayList<>(left);
|
||||
List<String> b = new ArrayList<>(right);
|
||||
|
||||
common += AuthorMatchers
|
||||
.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchEqualsIgnoreCase)
|
||||
.size() / 2;
|
||||
common += AuthorMatchers
|
||||
.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchOrderedTokenAndAbbreviations)
|
||||
.size() / 2;
|
||||
|
||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
|
||||
// compare each element of List1 with each element of List2
|
||||
int alreadyMatched = common;
|
||||
for (int i = 0; i < a.size(); i++) {
|
||||
Person p1 = new Person(a.get(i), false);
|
||||
|
||||
|
@ -123,13 +138,13 @@ public class AuthorsMatch extends AbstractListComparator {
|
|||
}
|
||||
}
|
||||
|
||||
if (i - common > maxMiss) {
|
||||
if (i - common - alreadyMatched > maxMiss) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// normalization factor to compute the score
|
||||
int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common);
|
||||
int normFactor = left.size() == right.size() ? left.size() : (left.size() + right.size() - common);
|
||||
|
||||
if (TYPE.equals("percentage")) {
|
||||
return (double) common / normFactor;
|
||||
|
@ -160,5 +175,4 @@ public class AuthorsMatch extends AbstractListComparator {
|
|||
public String normalization(String s) {
|
||||
return normalize(utf8(cleanup(s)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("countryMatch")
|
||||
public class CountryMatch extends AbstractStringComparator {
|
||||
|
||||
public CountryMatch(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public CountryMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
|
||||
return -1.0; // return -1 if a country is UNKNOWN
|
||||
}
|
||||
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -1,9 +1,10 @@
|
|||
package eu.dnetlib.dhp.enrich.orcid
|
||||
package eu.dnetlib.pace.util
|
||||
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
import scala.util.control.Breaks.{break, breakable}
|
||||
|
||||
object ORCIDAuthorMatchers {
|
||||
object AuthorMatchers {
|
||||
val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")
|
||||
|
||||
val WORD_DIFF = 2
|
||||
|
@ -45,7 +46,8 @@ object ORCIDAuthorMatchers {
|
|||
var res: Boolean = false
|
||||
if (e1.length != 1 && e2.length != 1) {
|
||||
res = e1 == e2
|
||||
longMatches += 1
|
||||
if (res)
|
||||
longMatches += 1
|
||||
} else {
|
||||
res = true
|
||||
shortMatches += 1
|
||||
|
@ -62,4 +64,49 @@ object ORCIDAuthorMatchers {
|
|||
}
|
||||
longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
|
||||
}
|
||||
|
||||
def removeMatches(
|
||||
graph_authors: java.util.List[String],
|
||||
orcid_authors: java.util.List[String],
|
||||
matchingFunc: java.util.function.BiFunction[String,String,Boolean]
|
||||
) : java.util.List[String] = {
|
||||
removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b))
|
||||
}
|
||||
|
||||
|
||||
def removeMatches(
|
||||
graph_authors: java.util.List[String],
|
||||
orcid_authors: java.util.List[String],
|
||||
matchingFunc: (String, String) => Boolean
|
||||
) : java.util.List[String] = {
|
||||
val matched = new java.util.ArrayList[String]()
|
||||
|
||||
if (graph_authors != null && !graph_authors.isEmpty) {
|
||||
val ait = graph_authors.iterator
|
||||
|
||||
while (ait.hasNext) {
|
||||
val author = ait.next()
|
||||
val oit = orcid_authors.iterator
|
||||
|
||||
breakable {
|
||||
while (oit.hasNext) {
|
||||
val orcid = oit.next()
|
||||
|
||||
if (matchingFunc(author, orcid)) {
|
||||
ait.remove()
|
||||
oit.remove()
|
||||
|
||||
matched.add(author)
|
||||
matched.add(orcid)
|
||||
|
||||
break()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
matched
|
||||
}
|
||||
|
||||
}
|
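A usage sketch from the Java side, mirroring the call pattern used in AuthorsMatch above; the author names are illustrative:
// illustrative lists; removeMatches is expected to drop matched names from both inputs
// and return the matched pairs (graph author followed by the corresponding orcid author)
List<String> graphAuthors = new ArrayList<>(Arrays.asList("John Smith", "A. Rossi"));
List<String> orcidAuthors = new ArrayList<>(Arrays.asList("john smith", "Maria Bianchi"));
List<String> matched = AuthorMatchers
    .removeMatches(graphAuthors, orcidAuthors,
        (BiFunction<String, String, Object>) AuthorMatchers::matchEqualsIgnoreCase);
// matched is expected to contain ["John Smith", "john smith"]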
File diff suppressed because it is too large
|
@ -336,4 +336,23 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
System.out.println("compare = " + compare);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void countryMatch() {
|
||||
|
||||
CountryMatch countryMatch = new CountryMatch(params);
|
||||
|
||||
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = countryMatch.distance("CHILE", "UNKNOWN", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = countryMatch.distance("CHILE", "ITALY", conf);
|
||||
assertEquals(0.0, result);
|
||||
|
||||
result = countryMatch.distance("CHILE", "CHILE", conf);
|
||||
assertEquals(1.0, result);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -7,10 +7,10 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class UtilTest {
|
||||
|
||||
|
@ -22,7 +22,7 @@ public class UtilTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
@Disabled
|
||||
public void paceResolverTest() {
|
||||
PaceResolver paceResolver = new PaceResolver();
|
||||
paceResolver.getComparator("keywordMatch", params);
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-shade-package</artifactId>
|
||||
<description>This module create a jar of all module dependencies</description>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<transformers>
|
||||
<transformer>
|
||||
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
|
||||
</transformer>
|
||||
<transformer />
|
||||
<transformer>
|
||||
<resource>META-INF/cxf/bus-extensions.txt</resource>
|
||||
</transformer>
|
||||
</transformers>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:*</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/maven/**</exclude>
|
||||
<exclude>META-INF/*.SF</exclude>
|
||||
<exclude>META-INF/*.DSA</exclude>
|
||||
<exclude>META-INF/*.RSA</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
<relocations>
|
||||
<relocation>
|
||||
<pattern>com</pattern>
|
||||
<shadedPattern>repackaged.com.google.common</shadedPattern>
|
||||
<includes>
|
||||
<include>com.google.common.**</include>
|
||||
</includes>
|
||||
</relocation>
|
||||
</relocations>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.28</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<version>5.6.1</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-core</artifactId>
|
||||
<version>3.3.3</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>byte-buddy</artifactId>
|
||||
<groupId>net.bytebuddy</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>byte-buddy-agent</artifactId>
|
||||
<groupId>net.bytebuddy</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<version>3.3.3</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
</project>
|
|
@ -1,14 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.promote;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
|
||||
|
||||
import java.util.function.BiFunction;
|
||||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
||||
|
||||
import java.util.function.BiFunction;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
|
||||
|
||||
/** OAF model merging support. */
|
||||
public class MergeAndGet {
|
||||
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -156,6 +157,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -95,6 +95,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -125,6 +125,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -95,6 +95,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -155,11 +156,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=2560
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
|
|
|
@ -95,6 +95,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -103,11 +103,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
|
@ -156,11 +157,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
|
|
|
@ -95,11 +95,12 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@@ -155,11 +156,12 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=4000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>

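These hunks adjust spark.sql.shuffle.partitions per graph table; the paired values (old first, new second in this rendered diff) move from 7000 to 15000 for publications, 10000 to 15000 for relations and 2560 to 4000 for software, so wide shuffles over the larger tables are split into more, smaller tasks. The value injected through <spark-opts> becomes the session default; a minimal sketch (illustrative only, the app name is hypothetical and not part of the workflow) of how a job can confirm the effective setting at runtime:

    import org.apache.spark.sql.SparkSession;

    // Sketch: reading back the shuffle-partition setting passed via --conf
    SparkSession spark = SparkSession
        .builder()
        .appName("shuffle-partitions-check") // hypothetical name
        .getOrCreate();
    // prints the value taken from <spark-opts>, e.g. 15000
    System.out.println(spark.conf().get("spark.sql.shuffle.partitions"));
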
@@ -8,6 +8,7 @@ import static org.mockito.Mockito.*;

import java.util.function.BiFunction;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;

@@ -85,6 +86,7 @@ public class MergeAndGetTest {
    }

    @Test
    @Disabled
    void shouldBehaveProperlyForRelationAndRelation() {
        // given
        Relation a = mock(Relation.class);

@@ -96,7 +98,9 @@ public class MergeAndGetTest {
        // then
        Oaf x = fn.get().apply(a, b);
        assertTrue(Relation.class.isAssignableFrom(x.getClass()));
        verify(a).mergeFrom(b);
        // TODO should be reimplemented
        // verify(a).mergeFrom(b);
        assertEquals(a, x);
    }

@@ -145,6 +149,7 @@ public class MergeAndGetTest {
    }

    @Test
    @Disabled
    void shouldBehaveProperlyForOafEntityAndOafEntity() {
        // given
        OafEntity a = mock(OafEntity.class);

@@ -156,7 +161,9 @@ public class MergeAndGetTest {
        // then
        Oaf x = fn.get().apply(a, b);
        assertTrue(OafEntity.class.isAssignableFrom(x.getClass()));
        verify(a).mergeFrom(b);
        // TODO should be reimplemented
        // verify(a).mergeFrom(b);
        assertEquals(a, x);
    }
}

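Both merge tests are now annotated with @Disabled and the Mockito interaction check on mergeFrom is commented out, with a TODO noting it should be reimplemented once the merge logic changes. For reference, a minimal JUnit 5 sketch of what the annotation does (the class name and reason are illustrative, not part of the repository):

    import org.junit.jupiter.api.Disabled;
    import org.junit.jupiter.api.Test;

    class DisabledExampleTest {
        @Test
        @Disabled("waiting for the merge logic to be reimplemented") // reason string is optional
        void skippedUntilMergeIsReimplemented() {
            // the body is reported as skipped and never executed while @Disabled is present
        }
    }
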
@@ -67,6 +67,9 @@ public class PrepareAffiliationRelations implements Serializable {
        final String openapcInputPath = parser.get("openapcInputPath");
        log.info("openapcInputPath: {}", openapcInputPath);

        final String dataciteInputPath = parser.get("dataciteInputPath");
        log.info("dataciteInputPath: {}", dataciteInputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

@@ -93,9 +96,15 @@ public class PrepareAffiliationRelations implements Serializable {
            JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
                spark, openapcInputPath, collectedFromOpenAPC);

            List<KeyValue> collectedFromDatacite = OafMapperUtils
                .listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
            JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
                spark, dataciteInputPath, collectedFromDatacite);

            crossrefRelations
                .union(pubmedRelations)
                .union(openAPCRelations)
                .union(dataciteRelations)
                .saveAsHadoopFile(
                    outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);

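The job now reads a fourth affiliation source: the Datacite input path is parsed and logged like the existing ones, its relations are prepared with a Datacite collectedFrom provenance, and the resulting pair RDD is unioned with the Crossref, PubMed and OpenAPC ones before the gzip-compressed sequence file is written. A hedged sketch of how one more source would slot into the same pattern (every "example" name and the identifier below are hypothetical):

    // Hypothetical fifth source, following the pattern of the hunk above
    final String exampleInputPath = parser.get("exampleInputPath"); // not a real parameter
    List<KeyValue> collectedFromExample = OafMapperUtils
        .listKeyValues("10|example_____::0000000000000000000000000000000000", "Example Source"); // illustrative id
    JavaPairRDD<Text, Text> exampleRelations = prepareAffiliationRelations(
        spark, exampleInputPath, collectedFromExample);
    // ...and add .union(exampleRelations) to the chain before saveAsHadoopFile(...)
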
@@ -95,7 +95,7 @@ public class SparkAtomicActionScoreJob implements Serializable {

        return projectScores.map((MapFunction<BipProjectModel, Project>) bipProjectScores -> {
            Project project = new Project();
            //project.setId(bipProjectScores.getProjectId());
            project.setId(bipProjectScores.getProjectId());
            project.setMeasures(bipProjectScores.toMeasures());
            return project;
        }, Encoders.bean(Project.class))

@@ -38,7 +38,6 @@ public class BipProjectModel {
        return projectId;
    }

    // each project bip measure has exactly one value, hence one key-value pair
    private Measure createMeasure(String measureId, String measureValue) {

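The new comment documents the invariant behind createMeasure: every BiP project measure carries exactly one value, so it maps onto a Measure whose unit list holds a single KeyValue. A hedged sketch of that shape, using only the Measure/KeyValue setters that appear elsewhere in this changeset (the "score" key and the body as a whole are assumptions, not the actual method):

    // Sketch only: one KeyValue per measure, as the comment above states
    Measure m = new Measure();
    m.setId(measureId);
    KeyValue kv = new KeyValue();
    kv.setKey("score"); // illustrative key
    kv.setValue(measureValue);
    m.setUnit(Arrays.asList(kv));
    return m;
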
@@ -80,9 +80,11 @@ public class PrepareFOSSparkJob implements Serializable {

        fosDataset
            .groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
                return getResult(ModelSupport.getIdPrefix(Result.class) + "|" + k, it);
            }, Encoders.bean(Result.class))
            .mapGroups(
                (MapGroupsFunction<String, FOSDataModel, Result>) (k,
                    it) -> getResult(
                        ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, it),
                Encoders.bean(Result.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")

@@ -113,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
            .forEach(
                l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
        r.setSubject(sbjs);
        r
            .setDataInfo(
                OafMapperUtils
                    .dataInfo(
                        false, null, true,
                        false,
                        OafMapperUtils
                            .qualifier(
                                ModelConstants.PROVENANCE_ENRICH,
                                null,
                                ModelConstants.DNET_PROVENANCE_ACTIONS,
                                ModelConstants.DNET_PROVENANCE_ACTIONS),
                        null));

        return r;
    }

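Two things change here: the per-OAID grouping now builds the result identifier through ModelSupport.entityIdPrefix keyed by the lower-cased simple class name (instead of ModelSupport.getIdPrefix), and the second hunk shrinks from 19 to 7 lines, which reads as the per-result dataInfo assignment being dropped from getResult. A small sketch of the prefix lookup as used above (both expressions come from this diff; the variable names are illustrative):

    // Sketch: both expressions resolve the numeric entity prefix used in OpenAIRE identifiers
    String byHelper = ModelSupport.getIdPrefix(Result.class) + "|" + key;
    String byMap = ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + key;
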
@@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
                s -> sbjs
                    .add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
            r.setSubject(sbjs);
            r
                .setDataInfo(
                    OafMapperUtils
                        .dataInfo(
                            false, null, true,
                            false,
                            OafMapperUtils
                                .qualifier(
                                    ModelConstants.PROVENANCE_ENRICH,
                                    null,
                                    ModelConstants.DNET_PROVENANCE_ACTIONS,
                                    ModelConstants.DNET_PROVENANCE_ACTIONS),
                            null));

            return r;
        }, Encoders.bean(Result.class))
        .write()

@@ -88,7 +88,7 @@ public class CreateActionSetSparkJob implements Serializable {
    private static void extractContent(SparkSession spark, String inputPath, String outputPath) {

        getTextTextJavaPairRDD(spark, inputPath)
            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);// , GzipCodec.class);
            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
    }

    private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(SparkSession spark, String inputPath) {

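The change swaps the previously commented-out codec argument back in, so the action set sequence file is written gzip-compressed again. A hedged sketch (assuming a JavaSparkContext named jsc, which is not shown in this hunk) of reading the compressed file back, which Hadoop handles transparently:

    // Sketch: the sequence file reader resolves the gzip codec from the file itself
    JavaPairRDD<Text, Text> actions = jsc.sequenceFile(outputPath, Text.class, Text.class);
    System.out.println("action records: " + actions.count());
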
@@ -166,6 +166,6 @@ public class MapOCIdsInPids implements Serializable {
            .option("compression", "gzip")
            .json(outputPath);

    }
    }

}

@@ -1,7 +1,26 @@

package eu.dnetlib.dhp.actionmanager.project;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.Arrays;
import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;

@@ -15,25 +34,8 @@ import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Objects;
import java.util.Optional;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

/**
 * Class that makes the ActionSet. To prepare the AS two joins are needed
 *

@ -79,35 +79,36 @@ public class CreateActionSetSparkJob implements Serializable {
|
|||
|
||||
private static void createActionSet(SparkSession spark, String inputPath, String outputPath) {
|
||||
JavaRDD<AtomicAction> relations = spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(
|
||||
(MapFunction<String, TransformativeAgreementModel>) value -> OBJECT_MAPPER
|
||||
.readValue(value, TransformativeAgreementModel.class),
|
||||
Encoders.bean(TransformativeAgreementModel.class))
|
||||
.flatMap(
|
||||
(FlatMapFunction<TransformativeAgreementModel, Relation>) value -> createRelation(
|
||||
value)
|
||||
.iterator(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter((FilterFunction<Relation>) Objects::nonNull)
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p));
|
||||
//TODO relations in stand-by waiting to know if we need to create them or not In case we need just make a union before saving the sequence file
|
||||
spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(
|
||||
(MapFunction<String, TransformativeAgreementModel>) value -> OBJECT_MAPPER
|
||||
.readValue(value, TransformativeAgreementModel.class),
|
||||
Encoders.bean(TransformativeAgreementModel.class))
|
||||
.map(
|
||||
(MapFunction<TransformativeAgreementModel, Result>) value -> createResult(
|
||||
value),
|
||||
Encoders.bean(Result.class))
|
||||
.filter((FilterFunction<Result>) r -> r != null)
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(
|
||||
(MapFunction<String, TransformativeAgreementModel>) value -> OBJECT_MAPPER
|
||||
.readValue(value, TransformativeAgreementModel.class),
|
||||
Encoders.bean(TransformativeAgreementModel.class))
|
||||
.flatMap(
|
||||
(FlatMapFunction<TransformativeAgreementModel, Relation>) value -> createRelation(
|
||||
value)
|
||||
.iterator(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter((FilterFunction<Relation>) Objects::nonNull)
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p));
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(
|
||||
(MapFunction<String, TransformativeAgreementModel>) value -> OBJECT_MAPPER
|
||||
.readValue(value, TransformativeAgreementModel.class),
|
||||
Encoders.bean(TransformativeAgreementModel.class))
|
||||
.map(
|
||||
(MapFunction<TransformativeAgreementModel, Result>) value -> createResult(
|
||||
value),
|
||||
Encoders.bean(Result.class))
|
||||
.filter((FilterFunction<Result>) r -> r != null)
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.union(relations)
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
|
|
|
@@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

@@ -13,7 +14,9 @@ import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;

@ -68,18 +71,59 @@ public class SparkAtomicActionUsageJob implements Serializable {
|
|||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
|
||||
final String datasourcePath = parser.get("datasourcePath");
|
||||
|
||||
runWithSparkHiveSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
prepareData(dbname, spark, workingPath + "/usageDb", "usage_stats", "result_id");
|
||||
prepareResultData(
|
||||
dbname, spark, workingPath + "/usageDb",
|
||||
"usage_stats",
|
||||
"result_id",
|
||||
"repository_id",
|
||||
datasourcePath);
|
||||
prepareData(dbname, spark, workingPath + "/projectDb", "project_stats", "id");
|
||||
prepareData(dbname, spark, workingPath + "/datasourceDb", "datasource_stats", "repository_id");
|
||||
writeActionSet(spark, workingPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void prepareResultData(String dbname, SparkSession spark, String workingPath, String tableName,
|
||||
String resultAttributeName, String datasourceAttributeName,
|
||||
String datasourcePath) {
|
||||
Dataset<UsageStatsResultModel> resultModel = spark
|
||||
.sql(
|
||||
String
|
||||
.format(
|
||||
"select %s as id, %s as datasourceId, sum(downloads) as downloads, sum(views) as views " +
|
||||
"from %s.%s group by %s, %s",
|
||||
resultAttributeName, datasourceAttributeName, dbname, tableName, resultAttributeName,
|
||||
datasourceAttributeName))
|
||||
.as(Encoders.bean(UsageStatsResultModel.class));
|
||||
Dataset<Datasource> datasource = readPath(spark, datasourcePath, Datasource.class)
|
||||
.filter((FilterFunction<Datasource>) d -> !d.getDataInfo().getDeletedbyinference())
|
||||
.map((MapFunction<Datasource, Datasource>) d -> {
|
||||
d.setId(d.getId().substring(3));
|
||||
return d;
|
||||
}, Encoders.bean(Datasource.class));
|
||||
resultModel
|
||||
.joinWith(datasource, resultModel.col("datasourceId").equalTo(datasource.col("id")), "left")
|
||||
.map((MapFunction<Tuple2<UsageStatsResultModel, Datasource>, UsageStatsResultModel>) t2 -> {
|
||||
UsageStatsResultModel usrm = t2._1();
|
||||
if (Optional.ofNullable(t2._2()).isPresent())
|
||||
usrm.setDatasourceId(usrm.getDatasourceId() + "||" + t2._2().getOfficialname().getValue());
|
||||
else
|
||||
usrm.setDatasourceId(usrm.getDatasourceId() + "||NO_MATCH_FOUND");
|
||||
return usrm;
|
||||
}, Encoders.bean(UsageStatsResultModel.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath);
|
||||
}
|
||||
|
||||
private static void prepareData(String dbname, SparkSession spark, String workingPath, String tableName,
|
||||
String attribute_name) {
|
||||
spark
|
||||
|
@ -115,15 +159,62 @@ public class SparkAtomicActionUsageJob implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
public static Measure newMeasureInstance(String id) {
|
||||
Measure m = new Measure();
|
||||
m.setId(id);
|
||||
m.setUnit(new ArrayList<>());
|
||||
return m;
|
||||
}
|
||||
|
||||
private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) {
|
||||
|
||||
return readPath(spark, inputPath, UsageStatsModel.class)
|
||||
.map((MapFunction<UsageStatsModel, Result>) usm -> {
|
||||
return readPath(spark, inputPath, UsageStatsResultModel.class)
|
||||
.groupByKey((MapFunction<UsageStatsResultModel, String>) usm -> usm.getId(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, UsageStatsResultModel, Result>) (k, it) -> {
|
||||
Result r = new Result();
|
||||
r.setId("50|" + usm.getId());
|
||||
r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
|
||||
r.setId("50|" + k);
|
||||
// id = download or view and unit = list of key value pairs
|
||||
Measure download = newMeasureInstance("downloads");
|
||||
Measure view = newMeasureInstance("views");
|
||||
UsageStatsResultModel first = it.next();
|
||||
addCountForDatasource(download, first, view);
|
||||
it.forEachRemaining(usm -> {
|
||||
addCountForDatasource(download, usm, view);
|
||||
});
|
||||
r.setMeasures(Arrays.asList(download, view));
|
||||
return r;
|
||||
}, Encoders.bean(Result.class));
|
||||
}, Encoders.bean(Result.class))
|
||||
// .map((MapFunction<UsageStatsResultModel, Result>) usm -> {
|
||||
// Result r = new Result();
|
||||
// r.setId("50|" + usm.getId());
|
||||
// r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
|
||||
// return r;
|
||||
// }, Encoders.bean(Result.class));
|
||||
;
|
||||
}
|
||||
|
||||
private static void addCountForDatasource(Measure download, UsageStatsResultModel usm, Measure view) {
|
||||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"");
|
||||
download
|
||||
.getUnit()
|
||||
.add(
|
||||
OafMapperUtils
|
||||
.newKeyValueInstance(usm.getDatasourceId(), String.valueOf(usm.getDownloads()), dataInfo));
|
||||
view
|
||||
.getUnit()
|
||||
.add(OafMapperUtils.newKeyValueInstance(usm.getDatasourceId(), String.valueOf(usm.getViews()), dataInfo));
|
||||
}
|
||||
|
||||
private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) {
|
||||
|
|
|
@@ -0,0 +1,18 @@

package eu.dnetlib.dhp.actionmanager.usagestats;

/**
 * @author miriam.baglioni
 * @Date 30/06/23
 */
public class UsageStatsResultModel extends UsageStatsModel {
    private String datasourceId;

    public String getDatasourceId() {
        return datasourceId;
    }

    public void setDatasourceId(String datasourceId) {
        this.datasourceId = datasourceId;
    }
}

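The new model simply extends UsageStatsModel with a datasourceId, so usage counts can be attributed per datasource; the usage-stats job shown above later rewrites that field to "<datasource id>||<official name>" (or "...||NO_MATCH_FOUND" when the left join finds no datasource). A short usage sketch with an illustrative identifier and name:

    UsageStatsResultModel usrm = new UsageStatsResultModel();
    usrm.setDatasourceId("10|openaire____::0123456789abcdef"); // illustrative id
    usrm.setDatasourceId(usrm.getDatasourceId() + "||Example Repository"); // as done after the join
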
@ -0,0 +1,257 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 18/04/24
|
||||
*/
|
||||
public class CreateActionSetFromWebEntries implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(CreateActionSetFromWebEntries.class);
|
||||
private static final String DOI_PREFIX = "50|doi_________::";
|
||||
|
||||
private static final String ROR_PREFIX = "20|ror_________::";
|
||||
|
||||
private static final String PMID_PREFIX = "50|pmid________::";
|
||||
|
||||
private static final String PMCID_PREFIX = "50|pmc_________::";
|
||||
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
||||
private static final String WEB_CRAWL_NAME = "Web Crawl";
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CreateActionSetFromWebEntries.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String blackListInputPath = parser.get("blackListPath");
|
||||
log.info("blackListInputPath: {}", blackListInputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
||||
createActionSet(spark, inputPath, outputPath, blackListInputPath);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
public static void createActionSet(SparkSession spark, String inputPath,
|
||||
String outputPath, String blackListInputPath) {
|
||||
|
||||
final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
|
||||
.filter("country_code=='IE'")
|
||||
.drop("publication_year");
|
||||
|
||||
final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
|
||||
|
||||
dataset
|
||||
.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
|
||||
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
|
||||
.drop("OpenAlexId")
|
||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||
List<Relation> ret = new ArrayList<>();
|
||||
final String ror = ROR_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
|
||||
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
|
||||
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
|
||||
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
|
||||
|
||||
return ret
|
||||
.iterator();
|
||||
}, Encoders.bean(Relation.class))
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
|
||||
}
|
||||
|
||||
private static Dataset<Row> readWebCrawl(SparkSession spark, String inputPath) {
|
||||
StructType webInfo = StructType
|
||||
.fromDDL(
|
||||
"`id` STRING , `doi` STRING, `ids` STRUCT<`pmid` :STRING, `pmcid`: STRING >, `publication_year` STRING, "
|
||||
+
|
||||
"`authorships` ARRAY<STRUCT <`institutions`: ARRAY <STRUCT <`ror`: STRING, `country_code` :STRING>>>>");
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.schema(webInfo)
|
||||
.json(inputPath)
|
||||
.withColumn(
|
||||
"authors", functions
|
||||
.explode(
|
||||
functions.col("authorships")))
|
||||
.selectExpr("id", "doi", "ids", "publication_year", "authors.institutions as institutions")
|
||||
.withColumn(
|
||||
"institution", functions
|
||||
.explode(
|
||||
functions.col("institutions")))
|
||||
|
||||
.selectExpr(
|
||||
"id", "doi", "institution.ror as ror",
|
||||
"institution.country_code as country_code", "publication_year")
|
||||
.distinct();
|
||||
|
||||
// .selectExpr(
|
||||
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
|
||||
// "institution.country_code as country_code", "publication_year")
|
||||
// .distinct();
|
||||
|
||||
}
|
||||
|
||||
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.option("header", true)
|
||||
.csv(inputPath)
|
||||
.select("OpenAlexId");
|
||||
}
|
||||
|
||||
private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
|
||||
if (pmcid == null)
|
||||
return new ArrayList<>();
|
||||
|
||||
return createAffiliatioRelationPair(
|
||||
PMCID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC", pmcid))),
|
||||
ror);
|
||||
}
|
||||
|
||||
private static List<Relation> createAffiliationRelationPairPMID(String pmid, String ror) {
|
||||
if (pmid == null)
|
||||
return new ArrayList<>();
|
||||
|
||||
return createAffiliatioRelationPair(
|
||||
PMID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
|
||||
ror);
|
||||
}
|
||||
|
||||
private static String removeResolver(String pidType, String pid) {
|
||||
switch (pidType) {
|
||||
case "PMID":
|
||||
return pid.substring(33);
|
||||
case "PMC":
|
||||
return "PMC" + pid.substring(43);
|
||||
case "DOI":
|
||||
return pid.substring(16);
|
||||
}
|
||||
|
||||
throw new RuntimeException();
|
||||
|
||||
}
|
||||
|
||||
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
|
||||
if (doi == null)
|
||||
return new ArrayList<>();
|
||||
|
||||
return createAffiliatioRelationPair(
|
||||
DOI_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI", doi))),
|
||||
ror);
|
||||
|
||||
}
|
||||
|
||||
private static List<Relation> createAffiliatioRelationPair(String resultId, String orgId) {
|
||||
ArrayList<Relation> newRelations = new ArrayList();
|
||||
|
||||
newRelations
|
||||
.add(
|
||||
OafMapperUtils
|
||||
.getRelation(
|
||||
orgId, resultId, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION,
|
||||
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||
Arrays
|
||||
.asList(
|
||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"sysimport:crasswalk:webcrawl", "Imported from Webcrawl",
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.9"),
|
||||
null));
|
||||
|
||||
newRelations
|
||||
.add(
|
||||
OafMapperUtils
|
||||
.getRelation(
|
||||
resultId, orgId, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION,
|
||||
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||
Arrays
|
||||
.asList(
|
||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"sysimport:crasswalk:webcrawl", "Imported from Webcrawl",
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.9"),
|
||||
null));
|
||||
|
||||
return newRelations;
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -18,7 +18,11 @@ import javax.xml.transform.TransformerConfigurationException;
|
|||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.xpath.*;
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -35,7 +39,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
|
|||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
/**
|
||||
* log.info(...) equal to log.trace(...) in the application-logs
|
||||
* log.info(...) equal to log.trace(...) in the application-logs
|
||||
* <p>
|
||||
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
||||
*
|
||||
|
@ -47,6 +51,7 @@ public class RestIterator implements Iterator<String> {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
||||
public static final String UTF_8 = "UTF-8";
|
||||
private static final int MAX_ATTEMPTS = 5;
|
||||
|
||||
private final HttpClientParams clientParams;
|
||||
|
||||
|
@ -60,8 +65,9 @@ public class RestIterator implements Iterator<String> {
|
|||
private final int resultSizeValue;
|
||||
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||
private int resultTotal = -1;
|
||||
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
|
||||
// or token scanned from results)
|
||||
private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
|
||||
// harvest
|
||||
// or token scanned from results)
|
||||
private InputStream resultStream;
|
||||
private Transformer transformer;
|
||||
private XPath xpath;
|
||||
|
@ -73,7 +79,7 @@ public class RestIterator implements Iterator<String> {
|
|||
private final String querySize;
|
||||
private final String authMethod;
|
||||
private final String authToken;
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
||||
private int discoverResultSize = 0;
|
||||
private int pagination = 1;
|
||||
/*
|
||||
|
@ -83,8 +89,8 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
private final String resultOutputFormat;
|
||||
|
||||
/** RestIterator class
|
||||
* compatible to version 1.3.33
|
||||
/**
|
||||
* RestIterator class compatible to version 1.3.33
|
||||
*/
|
||||
public RestIterator(
|
||||
final HttpClientParams clientParams,
|
||||
|
@ -108,40 +114,42 @@ public class RestIterator implements Iterator<String> {
|
|||
this.resumptionType = resumptionType;
|
||||
this.resumptionParam = resumptionParam;
|
||||
this.resultFormatValue = resultFormatValue;
|
||||
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
||||
this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
|
||||
this.queryParams = queryParams;
|
||||
this.authMethod = authMethod;
|
||||
this.authToken = authToken;
|
||||
this.resultOutputFormat = resultOutputFormat;
|
||||
|
||||
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
: "";
|
||||
this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
|
||||
: "";
|
||||
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
||||
|
||||
try {
|
||||
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||
} catch (Exception e) {
|
||||
} catch (final Exception e) {
|
||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
initQueue();
|
||||
}
|
||||
|
||||
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
|
||||
private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
|
||||
final String entityXpath)
|
||||
throws TransformerConfigurationException, XPathExpressionException {
|
||||
final TransformerFactory factory = TransformerFactory.newInstance();
|
||||
transformer = factory.newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
xpath = XPathFactory.newInstance().newXPath();
|
||||
xprResultTotalPath = xpath.compile(resultTotalXpath);
|
||||
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
xprEntity = xpath.compile(entityXpath);
|
||||
this.transformer = factory.newTransformer();
|
||||
this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
this.xpath = XPathFactory.newInstance().newXPath();
|
||||
this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
|
||||
this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
this.xprEntity = this.xpath.compile(entityXpath);
|
||||
}
|
||||
|
||||
private void initQueue() {
|
||||
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
||||
log.info("REST calls starting with {}", query);
|
||||
this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat;
|
||||
log.info("REST calls starting with {}", this.query);
|
||||
}
|
||||
|
||||
private void disconnect() {
|
||||
|
@ -154,12 +162,11 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (recordQueue.isEmpty() && query.isEmpty()) {
|
||||
if (this.recordQueue.isEmpty() && this.query.isEmpty()) {
|
||||
disconnect();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -168,214 +175,241 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (recordQueue) {
|
||||
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
||||
synchronized (this.recordQueue) {
|
||||
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
|
||||
try {
|
||||
query = downloadPage(query);
|
||||
} catch (CollectorException e) {
|
||||
this.query = downloadPage(this.query, 0);
|
||||
} catch (final CollectorException e) {
|
||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return recordQueue.poll();
|
||||
return this.recordQueue.poll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* download page and return nextQuery
|
||||
* download page and return nextQuery (with number of attempt)
|
||||
*/
|
||||
private String downloadPage(String query) throws CollectorException {
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String nextQuery = "";
|
||||
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
int urlOldResumptionSize = 0;
|
||||
InputStream theHttpInputStream;
|
||||
private String downloadPage(String query, final int attempt) throws CollectorException {
|
||||
|
||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
|
||||
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
||||
if (!query.contains("&cursor=")) {
|
||||
query += "&cursor=*";
|
||||
if (attempt > MAX_ATTEMPTS) {
|
||||
throw new CollectorException("Max Number of attempts reached, query:" + query);
|
||||
}
|
||||
|
||||
if (attempt > 0) {
|
||||
final int delay = (attempt * 5000);
|
||||
log.debug("Attempt {} with delay {}", attempt, delay);
|
||||
try {
|
||||
Thread.sleep(delay);
|
||||
} catch (final InterruptedException e) {
|
||||
new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("requestig URL [{}]", query);
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String nextQuery = "";
|
||||
final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
int urlOldResumptionSize = 0;
|
||||
InputStream theHttpInputStream;
|
||||
|
||||
URL qUrl = new URL(query);
|
||||
log.debug("authMethod: {}", authMethod);
|
||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else {
|
||||
theHttpInputStream = qUrl.openStream();
|
||||
}
|
||||
|
||||
resultStream = theHttpInputStream;
|
||||
if ("json".equals(resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
||||
resultXml = JsonUtils.convertToXML(resultJson);
|
||||
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
}
|
||||
|
||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: {}", nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
StringWriter sw = new StringWriter();
|
||||
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
String toEnqueue = sw.toString();
|
||||
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||
} else {
|
||||
recordQueue.add(sw.toString());
|
||||
}
|
||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||
if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
|
||||
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
||||
if (!query.contains("&cursor=")) {
|
||||
query += "&cursor=*";
|
||||
}
|
||||
} else {
|
||||
log.warn("resultXml is equal with emptyXml");
|
||||
}
|
||||
|
||||
resumptionInt += resultSizeValue;
|
||||
try {
|
||||
log.info("requesting URL [{}]", query);
|
||||
|
||||
switch (resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
final URL qUrl = new URL(query);
|
||||
log.debug("authMethod: {}", this.authMethod);
|
||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
|
||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else {
|
||||
theHttpInputStream = qUrl.openStream();
|
||||
}
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
this.resultStream = theHttpInputStream;
|
||||
if ("json".equals(this.resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
|
||||
resultXml = JsonUtils.convertToXML(resultJson);
|
||||
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
}
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (resultSizeValue < 2) {
|
||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||
resultNode = (Node) this.xpath
|
||||
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: {}", nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
final String toEnqueue = sw.toString();
|
||||
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue)
|
||||
|| emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||
log
|
||||
.warn(
|
||||
"The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||
} else {
|
||||
this.recordQueue.add(sw.toString());
|
||||
}
|
||||
}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
||||
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
if (isInteger(resumptionKeyValue[1])) {
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
||||
} else {
|
||||
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
|
||||
} else {
|
||||
log.warn("resultXml is equal with emptyXml");
|
||||
}
|
||||
|
||||
this.resumptionInt += this.resultSizeValue;
|
||||
|
||||
switch (this.resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
break;
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (this.resultSizeValue < 2) {
|
||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||
}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
final String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (final String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
|
||||
final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
if (isInteger(resumptionKeyValue[1])) {
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
||||
} else {
|
||||
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) {
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
this.resultTotal = this.discoverResultSize;
|
||||
} else {
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
this.resultTotal = this.resumptionInt + 1;
|
||||
if (nodeList != null) {
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
}
|
||||
resultTotal = discoverResultSize;
|
||||
} else {
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
resultTotal = resumptionInt + 1;
|
||||
log.info("discoverResultSize: {}", this.discoverResultSize);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over page numbers
|
||||
this.pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
this.resultTotal = this.discoverResultSize;
|
||||
this.pagination = this.discoverResultSize;
|
||||
}
|
||||
}
|
||||
log.info("discoverResultSize: {}", discoverResultSize);
|
||||
break;
|
||||
this.resumptionInt = this.pagination;
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over page numbers
|
||||
pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
resultTotal = discoverResultSize;
|
||||
pagination = discoverResultSize;
|
||||
}
|
||||
resumptionInt = pagination;
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
|
||||
// in
|
||||
// solr)
|
||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
|
||||
// solr)
|
||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||
this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
|
||||
this.queryParams = this.queryParams.replace("&cursor=*", "");
|
||||
|
||||
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
|
||||
queryParams = queryParams.replace("&cursor=*", "");
|
||||
// terminating if length of nodeList is 0
|
||||
if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
|
||||
this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
|
||||
} else {
|
||||
this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
|
||||
// resultSizeValue
|
||||
// because the iteration is over
|
||||
// real length and the
|
||||
// resultSizeValue is added before
|
||||
// the switch()
|
||||
}
|
||||
|
||||
// terminating if length of nodeList is 0
|
||||
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
|
||||
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
|
||||
} else {
|
||||
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
|
||||
// because the iteration is over
|
||||
// real length and the
|
||||
// resultSizeValue is added before
|
||||
// the switch()
|
||||
}
|
||||
this.discoverResultSize = nodeList.getLength();
|
||||
|
||||
discoverResultSize = nodeList.getLength();
|
||||
log
|
||||
.debug(
|
||||
"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
|
||||
+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
|
||||
|
||||
log
|
||||
.debug(
|
||||
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
|
||||
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
|
||||
break;
|
||||
|
||||
break;
|
||||
default: // otherwise: abort
|
||||
// resultTotal = resumptionInt;
|
||||
break;
|
||||
}
|
||||
|
||||
default: // otherwise: abort
|
||||
// resultTotal = resumptionInt;
|
||||
break;
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultTotal == -1) {
|
||||
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
||||
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
|
||||
resultTotal += 1;
|
||||
} // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + resultTotal);
|
||||
try {
|
||||
if (this.resultTotal == -1) {
|
||||
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
|
||||
if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
this.resultTotal += 1;
|
||||
} // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + this.resultTotal);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||
log.debug("resultTotal: " + this.resultTotal);
|
||||
log.debug("resInt: " + this.resumptionInt);
|
||||
if (this.resumptionInt <= this.resultTotal) {
|
||||
nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
|
||||
+ this.resumptionStr
|
||||
+ this.queryFormat;
|
||||
} else {
|
||||
nextQuery = "";
|
||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||
}
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
} catch (final Throwable e) {
|
||||
log.warn(e.getMessage(), e);
|
||||
return downloadPage(query, attempt + 1);
|
||||
}
|
||||
log.debug("resultTotal: " + resultTotal);
|
||||
log.debug("resInt: " + resumptionInt);
|
||||
if (resumptionInt <= resultTotal) {
|
||||
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
|
||||
+ queryFormat;
|
||||
} else {
|
||||
nextQuery = "";
|
||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||
}
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
|
||||
}
|
||||
|
||||
private boolean isInteger(String s) {
|
||||
private boolean isInteger(final String s) {
|
||||
boolean isValidInteger = false;
|
||||
try {
|
||||
Integer.parseInt(s);
|
||||
|
@ -383,7 +417,7 @@ public class RestIterator implements Iterator<String> {
|
|||
// s is a valid integer
|
||||
|
||||
isValidInteger = true;
|
||||
} catch (NumberFormatException ex) {
|
||||
} catch (final NumberFormatException ex) {
|
||||
// s is not an integer
|
||||
}
|
||||
|
||||
|
@ -391,20 +425,20 @@ public class RestIterator implements Iterator<String> {
|
|||
}
|
||||
|
||||
// Method to encode a string value using `UTF-8` encoding scheme
|
||||
private String encodeValue(String value) {
|
||||
private String encodeValue(final String value) {
|
||||
try {
|
||||
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
} catch (final UnsupportedEncodingException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
public String getResultFormatValue() {
|
||||
return resultFormatValue;
|
||||
return this.resultFormatValue;
|
||||
}
|
||||
|
||||
public String getResultOutputFormat() {
|
||||
return resultOutputFormat;
|
||||
return this.resultOutputFormat;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@@ -49,29 +49,29 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
    public MetadataRecord call(MetadataRecord value) {
        aggregationCounter.getTotalItems().add(1);

        Processor processor = new Processor(false);
        Processor processor = new Processor(false);

        processor.registerExtensionFunction(cleanFunction);
        processor.registerExtensionFunction(new DateCleaner());
        processor.registerExtensionFunction(new PersonCleaner());
        processor.registerExtensionFunction(cleanFunction);
        processor.registerExtensionFunction(new DateCleaner());
        processor.registerExtensionFunction(new PersonCleaner());

        final XsltCompiler comp = processor.newXsltCompiler();
        QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM);
        comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId()));
        QName datasourceNameParam = new QName(DATASOURCE_NAME_PARAM);
        comp.setParameter(datasourceNameParam, new XdmAtomicValue(value.getProvenance().getDatasourceName()));
        XsltExecutable xslt;
        XdmNode source;
        try {
            xslt = comp
                .compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
            source = processor
                .newDocumentBuilder()
                .build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
        } catch (Throwable e) {
            throw new RuntimeException("Error on parsing xslt", e);
        }
        try {
            final XsltCompiler comp = processor.newXsltCompiler();
            QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM);
            comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId()));
            QName datasourceNameParam = new QName(DATASOURCE_NAME_PARAM);
            comp.setParameter(datasourceNameParam, new XdmAtomicValue(value.getProvenance().getDatasourceName()));
            XsltExecutable xslt;
            XdmNode source;
            try {
                xslt = comp
                    .compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
                source = processor
                    .newDocumentBuilder()
                    .build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
            } catch (Throwable e) {
                throw new RuntimeException("Error on parsing xslt", e);
            }
            try {
                XsltTransformer trans = xslt.load();
                trans.setInitialContextNode(source);
                final StringWriter output = new StringWriter();

@@ -23,6 +23,12 @@
    "paramDescription": "the path to get the input data from OpenAPC",
    "paramRequired": true
  },
  {
    "paramName": "dip",
    "paramLongName": "dataciteInputPath",
    "paramDescription": "the path to get the input data from Datacite",
    "paramRequired": true
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",

@@ -34,4 +34,6 @@ oozie.wf.application.path=${oozieTopWfApplicationPath}
crossrefInputPath=/data/bip-affiliations/crossref-data.json
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
openapcInputPath=/data/bip-affiliations/openapc-data.json
dataciteInputPath=/data/bip-affiliations/datacite-data.json

outputPath=/tmp/crossref-affiliations-output-v5

@@ -13,6 +13,10 @@
        <name>openapcInputPath</name>
        <description>the path where to find the inferred affiliation relations from OpenAPC</description>
    </property>
    <property>
        <name>dataciteInputPath</name>
        <description>the path where to find the inferred affiliation relations from Datacite</description>
    </property>
    <property>
        <name>outputPath</name>
        <description>the path where to store the actionset</description>

@ -107,6 +111,8 @@
|
|||
<arg>--crossrefInputPath</arg><arg>${crossrefInputPath}</arg>
|
||||
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
|
||||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
@@ -28,5 +28,11 @@
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workingPath where to save the content of the usage_stats table",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dp",
|
||||
"paramLongName": "datasourcePath",
|
||||
"paramDescription": "the workingPath where to save the content of the usage_stats table",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
@@ -90,6 +90,7 @@
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--usagestatsdb</arg><arg>${usagestatsdb}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}</arg>
|
||||
<arg>--datasourcePath</arg><arg>${datasourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
@@ -0,0 +1,25 @@
[
|
||||
{
|
||||
"paramName": "sp",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the zipped opencitations file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "op",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "bl",
|
||||
"paramLongName": "blackListPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
@@ -0,0 +1,3 @@
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
||||
outputPath=/tmp/miriam/webcrawlComplete/
|
||||
blackListPath=/user/miriam.baglioni/openalex-blackList
@@ -0,0 +1,58 @@
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
@@ -0,0 +1,54 @@
<workflow-app name="WebCrawl Integration" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="create_actionset"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="create_actionset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the AS for WC</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.webcrawl.CreateActionSetFromWebEntries</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
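The action above produces the web crawl action set from the OpenAlex works dump, excluding whatever is listed under blackListPath. Purely as an illustration of what such a blacklist filter amounts to in Spark (the join key, column names and file formats here are assumptions, not taken from CreateActionSetFromWebEntries), a reduced sketch:

    import org.apache.spark.sql.SparkSession

    object WebCrawlBlackListSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("webcrawl-blacklist-sketch").getOrCreate()

        // hypothetical layout: works as JSON with an "id" column, blacklist as one id per line
        val works = spark.read.json("/user/miriam.baglioni/openalex-snapshot/data/works/")
        val blackList = spark.read.text("/user/miriam.baglioni/openalex-blackList").toDF("id")

        // keep only the works whose id is not blacklisted
        works
          .join(blackList, Seq("id"), "left_anti")
          .write
          .mode("overwrite")
          .json("/tmp/miriam/webcrawlComplete/")

        spark.stop()
      }
    }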
@@ -0,0 +1,32 @@
[
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "The base path of Crossref DUMP",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "uw",
|
||||
"paramLongName": "unpaywallPath",
|
||||
"paramDescription": "The base path of unpaywall DUMP",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mov",
|
||||
"paramLongName": "mdstoreOutputVersion",
|
||||
"paramDescription": "The mdstore Output Version",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the Information System Service LookUp URL",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
@@ -0,0 +1,863 @@
[
|
||||
{
|
||||
"id": "100007630",
|
||||
"uri": "http://dx.doi.org/10.13039/100007630",
|
||||
"name": "College of Engineering and Informatics, National University of Ireland, Galway",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100007731",
|
||||
"uri": "http://dx.doi.org/10.13039/100007731",
|
||||
"name": "Endo International",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100008099",
|
||||
"uri": "http://dx.doi.org/10.13039/100008099",
|
||||
"name": "Food Safety Authority of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100008124",
|
||||
"uri": "http://dx.doi.org/10.13039/100008124",
|
||||
"name": "Department of Jobs, Enterprise and Innovation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100009098",
|
||||
"uri": "http://dx.doi.org/10.13039/100009098",
|
||||
"name": "Department of Foreign Affairs and Trade, Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100009099",
|
||||
"uri": "http://dx.doi.org/10.13039/100009099",
|
||||
"name": "Irish Aid",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100009770",
|
||||
"uri": "http://dx.doi.org/10.13039/100009770",
|
||||
"name": "National University of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100009985",
|
||||
"uri": "http://dx.doi.org/10.13039/100009985",
|
||||
"name": "Parkinson's Association of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100010399",
|
||||
"uri": "http://dx.doi.org/10.13039/100010399",
|
||||
"name": "European Society of Cataract and Refractive Surgeons",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100010414",
|
||||
"uri": "http://dx.doi.org/10.13039/100010414",
|
||||
"name": "Health Research Board",
|
||||
"synonym": [
|
||||
"501100001590", "501100023273"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "100010546",
|
||||
"uri": "http://dx.doi.org/10.13039/100010546",
|
||||
"name": "Deparment of Children and Youth Affairs, Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100010993",
|
||||
"uri": "http://dx.doi.org/10.13039/100010993",
|
||||
"name": "Irish Nephrology Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100011096",
|
||||
"uri": "http://dx.doi.org/10.13039/100011096",
|
||||
"name": "Jazz Pharmaceuticals",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100011396",
|
||||
"uri": "http://dx.doi.org/10.13039/100011396",
|
||||
"name": "Irish College of General Practitioners",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012919",
|
||||
"uri": "http://dx.doi.org/10.13039/100012919",
|
||||
"name": "Epilepsy Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012920",
|
||||
"uri": "http://dx.doi.org/10.13039/100012920",
|
||||
"name": "GLEN",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012921",
|
||||
"uri": "http://dx.doi.org/10.13039/100012921",
|
||||
"name": "Royal College of Surgeons in Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013029",
|
||||
"uri": "http://dx.doi.org/10.13039/100013029",
|
||||
"name": "Iris O'Brien Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013206",
|
||||
"uri": "http://dx.doi.org/10.13039/100013206",
|
||||
"name": "Food Institutional Research Measure",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013381",
|
||||
"uri": "http://dx.doi.org/10.13039/100013381",
|
||||
"name": "Irish Phytochemical Food Network",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013433",
|
||||
"uri": "http://dx.doi.org/10.13039/100013433",
|
||||
"name": "Transport Infrastructure Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013461",
|
||||
"uri": "http://dx.doi.org/10.13039/100013461",
|
||||
"name": "Arts and Disability Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013548",
|
||||
"uri": "http://dx.doi.org/10.13039/100013548",
|
||||
"name": "Filmbase",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100013917",
|
||||
"uri": "http://dx.doi.org/10.13039/100013917",
|
||||
"name": "Society for Musicology in Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100014251",
|
||||
"uri": "http://dx.doi.org/10.13039/100014251",
|
||||
"name": "Humanities in the European Research Area",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100014364",
|
||||
"uri": "http://dx.doi.org/10.13039/100014364",
|
||||
"name": "National Children's Research Centre",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100014384",
|
||||
"uri": "http://dx.doi.org/10.13039/100014384",
|
||||
"name": "Amarin Corporation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100014902",
|
||||
"uri": "http://dx.doi.org/10.13039/100014902",
|
||||
"name": "Irish Association for Cancer Research",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015023",
|
||||
"uri": "http://dx.doi.org/10.13039/100015023",
|
||||
"name": "Ireland Funds",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015037",
|
||||
"uri": "http://dx.doi.org/10.13039/100015037",
|
||||
"name": "Simon Cumbers Media Fund",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015319",
|
||||
"uri": "http://dx.doi.org/10.13039/100015319",
|
||||
"name": "Sport Ireland Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015320",
|
||||
"uri": "http://dx.doi.org/10.13039/100015320",
|
||||
"name": "Paralympics Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015442",
|
||||
"uri": "http://dx.doi.org/10.13039/100015442",
|
||||
"name": "Global Brain Health Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015992",
|
||||
"uri": "http://dx.doi.org/10.13039/100015992",
|
||||
"name": "St. Luke's Institute of Cancer Research",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100017897",
|
||||
"uri": "http://dx.doi.org/10.13039/100017897",
|
||||
"name": "Friedreich\u2019s Ataxia Research Alliance Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100018064",
|
||||
"uri": "http://dx.doi.org/10.13039/100018064",
|
||||
"name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
|
||||
"synonym": ["100012734"]
|
||||
},
|
||||
{
|
||||
"id": "100018172",
|
||||
"uri": "http://dx.doi.org/10.13039/100018172",
|
||||
"name": "Department of the Environment, Climate and Communications",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100018175",
|
||||
"uri": "http://dx.doi.org/10.13039/100018175",
|
||||
"name": "Dairy Processing Technology Centre",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100018270",
|
||||
"uri": "http://dx.doi.org/10.13039/100018270",
|
||||
"name": "Health Service Executive",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100018529",
|
||||
"uri": "http://dx.doi.org/10.13039/100018529",
|
||||
"name": "Alkermes",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100018542",
|
||||
"uri": "http://dx.doi.org/10.13039/100018542",
|
||||
"name": "Irish Endocrine Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100018754",
|
||||
"uri": "http://dx.doi.org/10.13039/100018754",
|
||||
"name": "An Roinn Sl\u00e1inte",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100019428",
|
||||
"uri": "http://dx.doi.org/10.13039/100019428",
|
||||
"name": "Nabriva Therapeutics",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100019637",
|
||||
"uri": "http://dx.doi.org/10.13039/100019637",
|
||||
"name": "Horizon Therapeutics",
|
||||
"synonym": ["100012754"]
|
||||
},
|
||||
{
|
||||
"id": "100020174",
|
||||
"uri": "http://dx.doi.org/10.13039/100020174",
|
||||
"name": "Health Research Charities Ireland",
|
||||
"synonym": ["100012891"]
|
||||
},
|
||||
{
|
||||
"id": "100020202",
|
||||
"uri": "http://dx.doi.org/10.13039/100020202",
|
||||
"name": "UCD Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100020233",
|
||||
"uri": "http://dx.doi.org/10.13039/100020233",
|
||||
"name": "Ireland Canada University Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100022943",
|
||||
"uri": "http://dx.doi.org/10.13039/100022943",
|
||||
"name": "National Cancer Registry Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001581",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001581",
|
||||
"name": "Arts Council of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001582",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001582",
|
||||
"name": "Centre for Ageing Research and Development in Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
|
||||
{
|
||||
"id": "501100001584",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001584",
|
||||
"name": "Department of Agriculture, Food and the Marine, Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001586",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001586",
|
||||
"name": "Department of Education and Skills, Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001587",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001587",
|
||||
"name": "Economic and Social Research Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001588",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001588",
|
||||
"name": "Enterprise Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001589",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001589",
|
||||
"name": "Environmental Protection Agency",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001591",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001591",
|
||||
"name": "Heritage Council",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001592",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001592",
|
||||
"name": "Higher Education Authority",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001593",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001593",
|
||||
"name": "Irish Cancer Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001594",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001594",
|
||||
"name": "Irish Heart Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001595",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001595",
|
||||
"name": "Irish Hospice Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001598",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001598",
|
||||
"name": "Mental Health Commission",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001600",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001600",
|
||||
"name": "Research and Education Foundation, Sligo General Hospital",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001601",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001601",
|
||||
"name": "Royal Irish Academy",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001603",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001603",
|
||||
"name": "Sustainable Energy Authority of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001604",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001604",
|
||||
"name": "Teagasc",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001627",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001627",
|
||||
"name": "Marine Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001628",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001628",
|
||||
"name": "Central Remedial Clinic",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001629",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001629",
|
||||
"name": "Royal Dublin Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001630",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001630",
|
||||
"name": "Dublin Institute for Advanced Studies",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001631",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001631",
|
||||
"name": "University College Dublin",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001633",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001633",
|
||||
"name": "National University of Ireland, Maynooth",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001634",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001634",
|
||||
"name": "University of Galway",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001635",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001635",
|
||||
"name": "University of Limerick",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001636",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001636",
|
||||
"name": "University College Cork",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001637",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001637",
|
||||
"name": "Trinity College Dublin",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001638",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001638",
|
||||
"name": "Dublin City University",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100002081",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002081",
|
||||
"name": "Irish Research Council",
|
||||
"synonym": ["501100001596", "501100001597"]
|
||||
},
|
||||
{
|
||||
"id": "501100002736",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002736",
|
||||
"name": "Covidien",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100002755",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002755",
|
||||
"name": "Brennan and Company",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100002919",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002919",
|
||||
"name": "Cork Institute of Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100002959",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002959",
|
||||
"name": "Dublin City Council",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100003036",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003036",
|
||||
"name": "Perrigo Company Charitable Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100003037",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003037",
|
||||
"name": "Elan",
|
||||
"synonym": ["501100021694"]
|
||||
},
|
||||
{
|
||||
"id": "501100003496",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003496",
|
||||
"name": "HeyStaks Technologies",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100003553",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003553",
|
||||
"name": "Gaelic Athletic Association",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100003840",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003840",
|
||||
"name": "Irish Institute of Clinical Neuroscience",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100003956",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003956",
|
||||
"name": "Aspect Medical Systems",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100004162",
|
||||
"uri": "http://dx.doi.org/10.13039/501100004162",
|
||||
"name": "Meath Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100004210",
|
||||
"uri": "http://dx.doi.org/10.13039/501100004210",
|
||||
"name": "Our Lady's Children's Hospital, Crumlin",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100004321",
|
||||
"uri": "http://dx.doi.org/10.13039/501100004321",
|
||||
"name": "Shire",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100004981",
|
||||
"uri": "http://dx.doi.org/10.13039/501100004981",
|
||||
"name": "Athlone Institute of Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100006518",
|
||||
"uri": "http://dx.doi.org/10.13039/501100006518",
|
||||
"name": "Department of Communications, Energy and Natural Resources, Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100006553",
|
||||
"uri": "http://dx.doi.org/10.13039/501100006553",
|
||||
"name": "Collaborative Centre for Applied Nanotechnology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100006759",
|
||||
"uri": "http://dx.doi.org/10.13039/501100006759",
|
||||
"name": "CLARITY Centre for Sensor Web Technologies",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100009246",
|
||||
"uri": "http://dx.doi.org/10.13039/501100009246",
|
||||
"name": "Technological University Dublin",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100009315",
|
||||
"uri": "http://dx.doi.org/10.13039/501100009315",
|
||||
"name": "Cystinosis Ireland",
|
||||
"synonym": ["501100001583"]
|
||||
},
|
||||
{
|
||||
"id": "501100010808",
|
||||
"uri": "http://dx.doi.org/10.13039/501100010808",
|
||||
"name": "Geological Survey of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100011030",
|
||||
"uri": "http://dx.doi.org/10.13039/501100011030",
|
||||
"name": "Alimentary Glycoscience Research Cluster",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100011031",
|
||||
"uri": "http://dx.doi.org/10.13039/501100011031",
|
||||
"name": "Alimentary Health",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100012354",
|
||||
"uri": "http://dx.doi.org/10.13039/501100012354",
|
||||
"name": "Inland Fisheries Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014384",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014384",
|
||||
"name": "X-Bolt Orthopaedics",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014710",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014710",
|
||||
"name": "PrecisionBiotics Group",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014827",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014827",
|
||||
"name": "Dormant Accounts Fund",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100016041",
|
||||
"uri": "http://dx.doi.org/10.13039/501100016041",
|
||||
"name": "St Vincents Anaesthesia Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100017501",
|
||||
"uri": "http://dx.doi.org/10.13039/501100017501",
|
||||
"name": "FotoNation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100018641",
|
||||
"uri": "http://dx.doi.org/10.13039/501100018641",
|
||||
"name": "Dairy Research Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100018839",
|
||||
"uri": "http://dx.doi.org/10.13039/501100018839",
|
||||
"name": "Irish Centre for High-End Computing",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100019905",
|
||||
"uri": "http://dx.doi.org/10.13039/501100019905",
|
||||
"name": "Galway University Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020036",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020036",
|
||||
"name": "Dystonia Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020221",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020221",
|
||||
"name": "Irish Motor Neurone Disease Association",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020270",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020270",
|
||||
"name": "Advanced Materials and Bioengineering Research",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020403",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020403",
|
||||
"name": "Irish Composites Centre",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020425",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020425",
|
||||
"name": "Irish Thoracic Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100021102",
|
||||
"uri": "http://dx.doi.org/10.13039/501100021102",
|
||||
"name": "Waterford Institute of Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100021110",
|
||||
"uri": "http://dx.doi.org/10.13039/501100021110",
|
||||
"name": "Irish MPS Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100021525",
|
||||
"uri": "http://dx.doi.org/10.13039/501100021525",
|
||||
"name": "Insight SFI Research Centre for Data Analytics",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100021838",
|
||||
"uri": "http://dx.doi.org/10.13039/501100021838",
|
||||
"name": "Royal College of Physicians of Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100022542",
|
||||
"uri": "http://dx.doi.org/10.13039/501100022542",
|
||||
"name": "Breakthrough Cancer Research",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100022610",
|
||||
"uri": "http://dx.doi.org/10.13039/501100022610",
|
||||
"name": "Breast Cancer Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100022728",
|
||||
"uri": "http://dx.doi.org/10.13039/501100022728",
|
||||
"name": "Munster Technological University",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100022729",
|
||||
"uri": "http://dx.doi.org/10.13039/501100022729",
|
||||
"name": "Institute of Technology, Tralee",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023378",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023378",
|
||||
"name": "Lauritzson Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023551",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023551",
|
||||
"name": "Cystic Fibrosis Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023970",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023970",
|
||||
"name": "Tyndall National Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100024094",
|
||||
"uri": "http://dx.doi.org/10.13039/501100024094",
|
||||
"name": "Raidi\u00f3 Teilif\u00eds \u00c9ireann",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100024242",
|
||||
"uri": "http://dx.doi.org/10.13039/501100024242",
|
||||
"name": "Synthesis and Solid State Pharmaceutical Centre",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100024313",
|
||||
"uri": "http://dx.doi.org/10.13039/501100024313",
|
||||
"name": "Irish Rugby Football Union",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100007490",
|
||||
"uri": "http://dx.doi.org/10.13039/100007490",
|
||||
"name": "Bausch and Lomb Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100007819",
|
||||
"uri": "http://dx.doi.org/10.13039/100007819",
|
||||
"name": "Allergan",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100010547",
|
||||
"uri": "http://dx.doi.org/10.13039/100010547",
|
||||
"name": "Irish Youth Justice Service",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100012733",
|
||||
"uri": "http://dx.doi.org/10.13039/100012733",
|
||||
"name": "National Parks and Wildlife Service",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015278",
|
||||
"uri": "http://dx.doi.org/10.13039/100015278",
|
||||
"name": "Pfizer Healthcare Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100017144",
|
||||
"uri": "http://dx.doi.org/10.13039/100017144",
|
||||
"name": "Shell E and P Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100022895",
|
||||
"uri": "http://dx.doi.org/10.13039/100022895",
|
||||
"name": "Health Research Institute, University of Limerick",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001599",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001599",
|
||||
"name": "National Council for Forest Research and Development",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100006554",
|
||||
"uri": "http://dx.doi.org/10.13039/501100006554",
|
||||
"name": "IDA Ireland",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100011626",
|
||||
"uri": "http://dx.doi.org/10.13039/501100011626",
|
||||
"name": "Energy Policy Research Centre, Economic and Social Research Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014531",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014531",
|
||||
"name": "Physical Education and Sport Sciences Department, University of Limerick",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014745",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014745",
|
||||
"name": "APC Microbiome Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014826",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014826",
|
||||
"name": "ADAPT - Centre for Digital Content Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020570",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020570",
|
||||
"name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020871",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020871",
|
||||
"name": "Bernal Institute, University of Limerick",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023852",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023852",
|
||||
"name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
|
||||
"synonym": []
|
||||
}
|
||||
]
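The list above acts as a small registry of Irish funders: each entry carries the Crossref Funder Registry id, its DOI, the display name and any synonym ids that should resolve to the same funder. A sketch of loading such a file and indexing it by both primary id and synonyms follows; the file name and the Funder case class are illustrative, not part of the repository.

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.module.scala.DefaultScalaModule
    import scala.io.Source

    case class Funder(id: String, uri: String, name: String, synonym: List[String])

    object FunderRegistrySketch {
      def main(args: Array[String]): Unit = {
        val mapper = new ObjectMapper()
        mapper.registerModule(DefaultScalaModule)

        val json = Source.fromFile("irish_funders.json", "UTF-8").mkString   // illustrative path
        val funders: List[Funder] = mapper.readValue(json, classOf[Array[Funder]]).toList

        // index every funder under its own id and under each of its synonym ids
        val byId: Map[String, Funder] =
          funders.flatMap(f => (f.id :: f.synonym).map(_ -> f)).toMap

        // "501100001590" is listed above as a synonym of the Health Research Board
        println(byId.get("501100001590").map(_.name))
      }
    }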
@@ -0,0 +1,44 @@
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 </value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
|
||||
|
||||
|
||||
</configuration>
@@ -0,0 +1,131 @@
<workflow-app name="generate_crossref_Datasource" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>The base path of Crossref DUMP </description>
|
||||
</property>
|
||||
<property>
|
||||
<name>unpaywallPath</name>
|
||||
<description>The base path of unpaywall DUMP </description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The Information service Lookup URL</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="generateOAF"/>
|
||||
<error to="EndReadRollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="generateOAF">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Crossref TO OAF</name>
|
||||
<class>eu.dnetlib.dhp.collection.crossref.SparkMapDumpIntoOAF</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=2g
|
||||
--conf spark.sql.shuffle.partitions=3000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--unpaywallPath</arg><arg>${unpaywallPath}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="EndReadRollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="RollBack"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
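In this workflow StartTransaction opens a new MDStore version and generateOAF receives that version, serialized as JSON, through --mdstoreOutputVersion; the Spark class then derives the HDFS path it must write to from it (the same pattern appears in SparkMapDumpIntoOAF at the end of this diff, and in the MAG and bio-entity workflows below). A reduced sketch of that path resolution, reusing the MDStoreVersion bean and MDSTORE_DATA_PATH constant exactly as the Scala code further down does:

    import com.fasterxml.jackson.databind.ObjectMapper
    import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
    import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion

    object MdStoreTargetPathSketch {
      // mdstoreOutputVersion is the JSON captured from the StartTransaction java action
      def resolveTargetPath(mdstoreOutputVersion: String): String = {
        val mapper = new ObjectMapper()
        val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
        val outputBasePath = cleanedMdStoreVersion.getHdfsPath    // base directory of the new version
        s"$outputBasePath$MDSTORE_DATA_PATH"                      // where the OAF records are written
      }
    }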
@@ -0,0 +1,21 @@
[
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mo",
|
||||
"paramLongName": "mdstoreOutputVersion",
|
||||
"paramDescription": "The mdstore output",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ma",
|
||||
"paramLongName": "magBasePath",
|
||||
"paramDescription": "The mag Base path",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
@@ -0,0 +1,15 @@
[
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mp",
|
||||
"paramLongName": "magBasePath",
|
||||
"paramDescription": "The base path of MAG DUMP CSV Tables",
|
||||
"paramRequired": true
|
||||
}
|
||||
|
||||
]
@@ -0,0 +1,21 @@
[
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "The as output Path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ma",
|
||||
"paramLongName": "magBasePath",
|
||||
"paramDescription": "The mag Base path",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
@@ -0,0 +1,23 @@
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
@@ -0,0 +1,160 @@
<workflow-app name="generate_MAG_Datasource" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>magBasePath</name>
|
||||
<description>The base path of MAG DUMP CSV Tables</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resume_from</name>
|
||||
<value>generateOAF</value>
|
||||
<description>start Node</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="generateTable">${wf:conf('resume_from') eq 'generateTable'}</case>
|
||||
<default to="StartTransaction"/> <!-- first action to be done when downloadDump is to be performed -->
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
|
||||
<action name="generateTable">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Generate MAG Table</name>
|
||||
<class>eu.dnetlib.dhp.collection.mag.SparkCreateMagDenormalizedTable</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=2g
|
||||
--conf spark.sql.shuffle.partitions=3000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--magBasePath</arg><arg>${magBasePath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="StartTransaction"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="generateOAF"/>
|
||||
<error to="EndReadRollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="generateOAF">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MAG TO OAF</name>
|
||||
<class>eu.dnetlib.dhp.collection.mag.SparkMAGtoOAF</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=2g
|
||||
--conf spark.sql.shuffle.partitions=3000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--magBasePath</arg><arg>${magBasePath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="EndReadRollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="RollBack"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
@@ -390,6 +390,18 @@ base_dc:link (I used dc:identifier)
</xsl:choose>
|
||||
</oaf:relation>
|
||||
</xsl:for-each>
|
||||
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.89</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
|
||||
classname="sysimport:crosswalk:aggregator"
|
||||
schemeid="dnet:provenanceActions"
|
||||
schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
|
||||
</metadata>
|
||||
<xsl:copy-of select="//*[local-name() = 'about']" />
|
||||
</record>
|
||||
@@ -429,6 +429,17 @@
</xsl:choose>
|
||||
</oaf:relation>
|
||||
</xsl:for-each>
|
||||
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.89</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
|
||||
classname="sysimport:crosswalk:aggregator"
|
||||
schemeid="dnet:provenanceActions"
|
||||
schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</metadata>
|
||||
<xsl:copy-of select="//*[local-name() = 'about']" />
|
||||
</record>
|
||||
@@ -1048,5 +1048,11 @@
"openaire_id": "re3data_____::r3d100010399",
|
||||
"datacite_name": "ZEW Forschungsdatenzentrum",
|
||||
"official_name": "ZEW Forschungsdatenzentrum"
|
||||
},
|
||||
"HBP.NEUROINF": {
|
||||
"openaire_id": "fairsharing_::2975",
|
||||
"datacite_name": "EBRAINS",
|
||||
"official_name": "EBRAINS"
|
||||
}
|
||||
|
||||
}
@@ -1,4 +1,4 @@
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
@@ -8,19 +8,40 @@
<name>database</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the Target Working dir path</description>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertDB"/>
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="ConvertDB"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
<action name="ConvertDB">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
@@ -41,11 +62,48 @@
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--database</arg><arg>${database}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
|
||||
</action>
|
||||
<end name="End"/>
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
@@ -2,5 +2,5 @@
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
|
||||
{"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
|
||||
]
@@ -1,5 +1,20 @@
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source Path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mo",
|
||||
"paramLongName": "mdstoreOutputVersion",
|
||||
"paramDescription": "the oaf path ",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
@@ -9,34 +9,26 @@
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the OAF MDStore Path</description>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resumeFrom</name>
|
||||
<value>DownloadEBILinks</value>
|
||||
<value>CreateEBIDataSet</value>
|
||||
<description>node to start</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="resume_from"/>
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
|
||||
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
||||
<case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
|
||||
<default to="DownloadEBILinks"/>
|
||||
</switch>
|
||||
</decision>
|
||||
@@ -77,9 +69,29 @@
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
|
||||
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
|
||||
</fs>
|
||||
<ok to="CreateEBIDataSet"/>
|
||||
<ok to="StartTransaction"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="CreateEBIDataSet"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CreateEBIDataSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
@@ -95,11 +107,49 @@
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
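The workflow above follows the MDStore transaction pattern: StartTransaction creates a NEW_VERSION and exposes its JSON description through wf:actionData, the Spark node writes its records into the path carried by that version, and the CommitVersion and RollBack nodes finalize or discard it via the same MDStoreActionNode. As a minimal sketch of how a Spark action can resolve the --mdstoreOutputVersion argument into a write path (the "/store" suffix below is an assumption standing in for the MDSTORE_DATA_PATH constant, whose value is not shown in this diff):

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion

object MdStoreVersionSketch {
  // Parse the JSON emitted by the StartTransaction node and derive the HDFS
  // path where the newly generated records must be written.
  def resolveDataPath(mdstoreOutputVersion: String): String = {
    val mapper = new ObjectMapper()
    val version = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
    version.getHdfsPath + "/store" // assumption: stands in for MDSTORE_DATA_PATH
  }
}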
|
File diff suppressed because it is too large
|
@ -0,0 +1,147 @@
|
|||
package eu.dnetlib.dhp.collection.crossref
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.{TransformationType, mergeUnpayWall}
|
||||
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions.{col, explode, lower}
|
||||
import org.apache.spark.sql.types._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Every Spark application runs this method,
 * where the whole logic of the Spark node is defined.
 */
|
||||
override def run(): Unit = {
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info("sourcePath: {}", sourcePath)
|
||||
val unpaywallPath = parser.get("unpaywallPath")
|
||||
log.info("unpaywallPath: {}", unpaywallPath)
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
require(vocabularies != null)
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info(s"outputBasePath is '$outputBasePath'")
|
||||
val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH"
|
||||
log.info(s"targetPath is '$targetPath'")
|
||||
transformCrossref(spark, sourcePath, targetPath, unpaywallPath, vocabularies)
|
||||
reportTotalSize(targetPath, outputBasePath)
|
||||
}
|
||||
|
||||
def transformUnpayWall(spark: SparkSession, unpaywallPath: String, crossrefPath: String): Dataset[UnpayWall] = {
|
||||
val schema = new StructType()
|
||||
.add(StructField("doi", StringType))
|
||||
.add(StructField("is_oa", BooleanType))
|
||||
.add(
|
||||
StructField(
|
||||
"best_oa_location",
|
||||
new StructType()
|
||||
.add("host_type", StringType)
|
||||
.add("license", StringType)
|
||||
.add("url", StringType)
|
||||
)
|
||||
)
|
||||
.add("oa_status", StringType)
|
||||
|
||||
import spark.implicits._
|
||||
val cId = spark.read
|
||||
.schema(new StructType().add("DOI", StringType))
|
||||
.json(crossrefPath)
|
||||
.withColumn("doi", lower(col("DOI")))
|
||||
|
||||
val uw = spark.read
|
||||
.schema(schema)
|
||||
.json(unpaywallPath)
|
||||
.withColumn("doi", lower(col("doi")))
|
||||
.where("is_oa = true and best_oa_location.url is not null")
|
||||
|
||||
uw.join(cId, uw("doi") === cId("doi"), "leftsemi").as[UnpayWall].cache()
|
||||
|
||||
}
|
||||
|
||||
def transformCrossref(
|
||||
spark: SparkSession,
|
||||
sourcePath: String,
|
||||
targetPath: String,
|
||||
unpaywallPath: String,
|
||||
vocabularies: VocabularyGroup
|
||||
): Unit = {
|
||||
import spark.implicits._
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
|
||||
val dump: Dataset[String] = spark.read.text(sourcePath).as[String]
|
||||
dump
|
||||
.flatMap(s => Crossref2Oaf.convert(s, vocabularies, TransformationType.OnlyRelation))
|
||||
.as[Oaf]
|
||||
.map(r => mapper.writeValueAsString(r))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(targetPath)
|
||||
val uw = transformUnpayWall(spark, unpaywallPath, sourcePath)
|
||||
val resultCrossref: Dataset[(String, Result)] = dump
|
||||
.flatMap(s => Crossref2Oaf.convert(s, vocabularies, TransformationType.OnlyResult))
|
||||
.as[Oaf]
|
||||
.map(r => r.asInstanceOf[Result])
|
||||
.map(r => (r.getPid.get(0).getValue, r))(Encoders.tuple(Encoders.STRING, resultEncoder))
|
||||
resultCrossref
|
||||
.joinWith(uw, resultCrossref("_1").equalTo(uw("doi")), "left")
|
||||
.map(k => {
|
||||
mergeUnpayWall(k._1._2, k._2)
|
||||
})
|
||||
.map(r => mapper.writeValueAsString(r))
|
||||
|
||||
.write
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.text(s"$targetPath")
|
||||
|
||||
// Generate affiliation relations:
|
||||
spark.read
|
||||
.json(sourcePath)
|
||||
.select(col("DOI"), explode(col("author.affiliation")).alias("affiliations"))
|
||||
.select(col("DOI"), explode(col("affiliations.id")).alias("aids"))
|
||||
.where("aids is not null")
|
||||
.select(col("DOI"), explode(col("aids")).alias("aff"))
|
||||
.select(col("DOI"), col("aff.id").alias("id"), col("aff.id-type").alias("idType"))
|
||||
.where(col("idType").like("ROR"))
|
||||
.flatMap(r => Crossref2Oaf.generateAffliation(r))
|
||||
.write
|
||||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.text(s"$targetPath")
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
object SparkMapDumpIntoOAF {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||
|
||||
new SparkMapDumpIntoOAF(
|
||||
log = logger,
|
||||
args = args,
|
||||
propertyPath = "/eu/dnetlib/dhp/collection/crossref/convert_crossref_dump_to_oaf_params.json"
|
||||
).initialize().run()
|
||||
}
|
||||
|
||||
}
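The join-and-merge step in transformCrossref above keys both sides by lower-cased DOI, left-joins them, and merges only when an UnpayWall record is present, so publications without an open-access match pass through unchanged. A self-contained sketch of that pattern with hypothetical case classes (not the project's Oaf model):

import org.apache.spark.sql.SparkSession

case class Pub(doi: String, title: String)
case class OA(doi: String, url: String)

object DoiJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("doi-join-sketch").getOrCreate()
    import spark.implicits._

    val pubs = Seq(Pub("10.1/a", "A"), Pub("10.1/b", "B")).toDS()
    val oa   = Seq(OA("10.1/a", "http://example.org/a.pdf")).toDS()

    // The left join keeps every publication; enrichment happens only when an OA record exists.
    val merged = pubs
      .joinWith(oa, pubs("doi") === oa("doi"), "left")
      .map { case (p, o) => if (o == null) p.title else s"${p.title} [OA: ${o.url}]" }

    merged.show(false)
    spark.stop()
  }
}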
|
|
@ -0,0 +1,780 @@
|
|||
package eu.dnetlib.dhp.collection.mag
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
|
||||
import eu.dnetlib.dhp.schema.oaf.{
|
||||
Author,
|
||||
DataInfo,
|
||||
Instance,
|
||||
Journal,
|
||||
Organization,
|
||||
Publication,
|
||||
Relation,
|
||||
Result,
|
||||
Dataset => OafDataset
|
||||
}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
case class MAGPaper(
|
||||
paperId: Option[Long],
|
||||
doi: Option[String],
|
||||
docType: Option[String],
|
||||
paperTitle: Option[String],
|
||||
originalTitle: Option[String],
|
||||
bookTitle: Option[String],
|
||||
year: Option[Int],
|
||||
date: Option[String],
|
||||
onlineDate: Option[String],
|
||||
publisher: Option[String],
|
||||
journalId: Option[Long],
|
||||
journalName: Option[String],
|
||||
journalIssn: Option[String],
|
||||
journalPublisher: Option[String],
|
||||
conferenceSeriesId: Option[Long],
|
||||
conferenceInstanceId: Option[Long],
|
||||
conferenceName: Option[String],
|
||||
conferenceLocation: Option[String],
|
||||
conferenceStartDate: Option[String],
|
||||
conferenceEndDate: Option[String],
|
||||
volume: Option[String],
|
||||
issue: Option[String],
|
||||
firstPage: Option[String],
|
||||
lastPage: Option[String],
|
||||
referenceCount: Option[Long],
|
||||
citationCount: Option[Long],
|
||||
estimatedCitation: Option[Long],
|
||||
originalVenue: Option[String],
|
||||
familyId: Option[Long],
|
||||
familyRank: Option[Int],
|
||||
docSubTypes: Option[String],
|
||||
createdDate: Option[String],
|
||||
abstractText: Option[String],
|
||||
authors: Option[List[MAGAuthor]],
|
||||
urls: Option[List[String]]
|
||||
)
|
||||
|
||||
case class MAGAuthor(
|
||||
AffiliationId: Option[Long],
|
||||
AuthorSequenceNumber: Option[Int],
|
||||
AffiliationName: Option[String],
|
||||
AuthorName: Option[String],
|
||||
AuthorId: Option[Long],
|
||||
GridId: Option[String]
|
||||
)
|
||||
|
||||
object MagUtility extends Serializable {
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
|
||||
|
||||
private val MAGDataInfo: DataInfo = {
|
||||
val di = new DataInfo
|
||||
di.setDeletedbyinference(false)
|
||||
di.setInferred(false)
|
||||
di.setInvisible(true)
|
||||
di.setTrust("0.9")
|
||||
di.setProvenanceaction(
|
||||
OafMapperUtils.qualifier(
|
||||
ModelConstants.SYSIMPORT_ACTIONSET,
|
||||
ModelConstants.SYSIMPORT_ACTIONSET,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS
|
||||
)
|
||||
)
|
||||
di
|
||||
}
|
||||
|
||||
val datatypedict = Map(
|
||||
"bool" -> BooleanType,
|
||||
"int" -> IntegerType,
|
||||
"uint" -> IntegerType,
|
||||
"long" -> LongType,
|
||||
"ulong" -> LongType,
|
||||
"float" -> FloatType,
|
||||
"string" -> StringType,
|
||||
"DateTime" -> DateType
|
||||
)
|
||||
|
||||
val stream: Map[String, (String, Seq[String])] = Map(
|
||||
"Affiliations" -> Tuple2(
|
||||
"mag/Affiliations.txt",
|
||||
Seq(
|
||||
"AffiliationId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"GridId:string",
|
||||
"OfficialPage:string",
|
||||
"WikiPage:string",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"Iso3166Code:string",
|
||||
"Latitude:float?",
|
||||
"Longitude:float?",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"AuthorExtendedAttributes" -> Tuple2(
|
||||
"mag/AuthorExtendedAttributes.txt",
|
||||
Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
|
||||
),
|
||||
"Authors" -> Tuple2(
|
||||
"mag/Authors.txt",
|
||||
Seq(
|
||||
"AuthorId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"LastKnownAffiliationId:long?",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"ConferenceInstances" -> Tuple2(
|
||||
"mag/ConferenceInstances.txt",
|
||||
Seq(
|
||||
"ConferenceInstanceId:long",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"ConferenceSeriesId:long",
|
||||
"Location:string",
|
||||
"OfficialUrl:string",
|
||||
"StartDate:DateTime?",
|
||||
"EndDate:DateTime?",
|
||||
"AbstractRegistrationDate:DateTime?",
|
||||
"SubmissionDeadlineDate:DateTime?",
|
||||
"NotificationDueDate:DateTime?",
|
||||
"FinalVersionDueDate:DateTime?",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"Latitude:float?",
|
||||
"Longitude:float?",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"ConferenceSeries" -> Tuple2(
|
||||
"mag/ConferenceSeries.txt",
|
||||
Seq(
|
||||
"ConferenceSeriesId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"EntityRelatedEntities" -> Tuple2(
|
||||
"advanced/EntityRelatedEntities.txt",
|
||||
Seq(
|
||||
"EntityId:long",
|
||||
"EntityType:string",
|
||||
"RelatedEntityId:long",
|
||||
"RelatedEntityType:string",
|
||||
"RelatedType:int",
|
||||
"Score:float"
|
||||
)
|
||||
),
|
||||
"FieldOfStudyChildren" -> Tuple2(
|
||||
"advanced/FieldOfStudyChildren.txt",
|
||||
Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
|
||||
),
|
||||
"FieldOfStudyExtendedAttributes" -> Tuple2(
|
||||
"advanced/FieldOfStudyExtendedAttributes.txt",
|
||||
Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
|
||||
),
|
||||
"FieldsOfStudy" -> Tuple2(
|
||||
"advanced/FieldsOfStudy.txt",
|
||||
Seq(
|
||||
"FieldOfStudyId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"MainType:string",
|
||||
"Level:int",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"Journals" -> Tuple2(
|
||||
"mag/Journals.txt",
|
||||
Seq(
|
||||
"JournalId:long",
|
||||
"Rank:uint",
|
||||
"NormalizedName:string",
|
||||
"DisplayName:string",
|
||||
"Issn:string",
|
||||
"Publisher:string",
|
||||
"Webpage:string",
|
||||
"PaperCount:long",
|
||||
"PaperFamilyCount:long",
|
||||
"CitationCount:long",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"PaperAbstractsInvertedIndex" -> Tuple2(
|
||||
"nlp/PaperAbstractsInvertedIndex.txt.*",
|
||||
Seq("PaperId:long", "IndexedAbstract:string")
|
||||
),
|
||||
"PaperAuthorAffiliations" -> Tuple2(
|
||||
"mag/PaperAuthorAffiliations.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"AuthorId:long",
|
||||
"AffiliationId:long?",
|
||||
"AuthorSequenceNumber:uint",
|
||||
"OriginalAuthor:string",
|
||||
"OriginalAffiliation:string"
|
||||
)
|
||||
),
|
||||
"PaperCitationContexts" -> Tuple2(
|
||||
"nlp/PaperCitationContexts.txt",
|
||||
Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
|
||||
),
|
||||
"PaperExtendedAttributes" -> Tuple2(
|
||||
"mag/PaperExtendedAttributes.txt",
|
||||
Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
|
||||
),
|
||||
"PaperFieldsOfStudy" -> Tuple2(
|
||||
"advanced/PaperFieldsOfStudy.txt",
|
||||
Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
|
||||
),
|
||||
"PaperMeSH" -> Tuple2(
|
||||
"advanced/PaperMeSH.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"DescriptorUI:string",
|
||||
"DescriptorName:string",
|
||||
"QualifierUI:string",
|
||||
"QualifierName:string",
|
||||
"IsMajorTopic:bool"
|
||||
)
|
||||
),
|
||||
"PaperRecommendations" -> Tuple2(
|
||||
"advanced/PaperRecommendations.txt",
|
||||
Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
|
||||
),
|
||||
"PaperReferences" -> Tuple2(
|
||||
"mag/PaperReferences.txt",
|
||||
Seq("PaperId:long", "PaperReferenceId:long")
|
||||
),
|
||||
"PaperResources" -> Tuple2(
|
||||
"mag/PaperResources.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"ResourceType:int",
|
||||
"ResourceUrl:string",
|
||||
"SourceUrl:string",
|
||||
"RelationshipType:int"
|
||||
)
|
||||
),
|
||||
"PaperUrls" -> Tuple2(
|
||||
"mag/PaperUrls.txt",
|
||||
Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
|
||||
),
|
||||
"Papers" -> Tuple2(
|
||||
"mag/Papers.txt",
|
||||
Seq(
|
||||
"PaperId:long",
|
||||
"Rank:uint",
|
||||
"Doi:string",
|
||||
"DocType:string",
|
||||
"PaperTitle:string",
|
||||
"OriginalTitle:string",
|
||||
"BookTitle:string",
|
||||
"Year:int?",
|
||||
"Date:DateTime?",
|
||||
"OnlineDate:DateTime?",
|
||||
"Publisher:string",
|
||||
"JournalId:long?",
|
||||
"ConferenceSeriesId:long?",
|
||||
"ConferenceInstanceId:long?",
|
||||
"Volume:string",
|
||||
"Issue:string",
|
||||
"FirstPage:string",
|
||||
"LastPage:string",
|
||||
"ReferenceCount:long",
|
||||
"CitationCount:long",
|
||||
"EstimatedCitation:long",
|
||||
"OriginalVenue:string",
|
||||
"FamilyId:long?",
|
||||
"FamilyRank:uint?",
|
||||
"DocSubTypes:string",
|
||||
"CreatedDate:DateTime"
|
||||
)
|
||||
),
|
||||
"RelatedFieldOfStudy" -> Tuple2(
|
||||
"advanced/RelatedFieldOfStudy.txt",
|
||||
Seq(
|
||||
"FieldOfStudyId1:long",
|
||||
"Type1:string",
|
||||
"FieldOfStudyId2:long",
|
||||
"Type2:string",
|
||||
"Rank:float"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
def getSchema(streamName: String): StructType = {
|
||||
var schema = new StructType()
|
||||
val d: Seq[String] = stream(streamName)._2
|
||||
d.foreach { t =>
|
||||
val currentType = t.split(":")
|
||||
val fieldName: String = currentType.head
|
||||
var fieldType: String = currentType.last
|
||||
val nullable: Boolean = fieldType.endsWith("?")
|
||||
if (nullable)
|
||||
fieldType = fieldType.replace("?", "")
|
||||
schema = schema.add(StructField(fieldName, datatypedict(fieldType), nullable))
|
||||
}
|
||||
schema
|
||||
}
|
||||
|
||||
def loadMagEntity(spark: SparkSession, entity: String, basePath: String): Dataset[Row] = {
|
||||
if (stream.contains(entity)) {
|
||||
val s = getSchema(entity)
|
||||
val pt = stream(entity)._1
|
||||
spark.read
|
||||
.option("header", "false")
|
||||
.option("charset", "UTF8")
|
||||
.option("delimiter", "\t")
|
||||
.schema(s)
|
||||
.csv(s"$basePath/$pt")
|
||||
} else
|
||||
null
|
||||
|
||||
}
|
||||
|
||||
def createResultFromType(magType: Option[String], source: Option[String]): Result = {
|
||||
var result: Result = null
|
||||
|
||||
if (magType == null || magType.orNull == null) {
|
||||
result = new Publication
|
||||
result.setDataInfo(MAGDataInfo)
|
||||
val i = new Instance
|
||||
i.setInstancetype(
|
||||
qualifier(
|
||||
"0038",
|
||||
"Other literature type",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
result.setInstance(List(i).asJava)
|
||||
return result
|
||||
}
|
||||
|
||||
val currentType: String = magType.get
|
||||
|
||||
val tp = currentType.toLowerCase match {
|
||||
case "book" =>
|
||||
result = new Publication
|
||||
qualifier("0002", "Book", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)
|
||||
case "bookchapter" =>
|
||||
result = new Publication
|
||||
qualifier(
|
||||
"00013",
|
||||
"Part of book or chapter of book",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
case "journal" =>
|
||||
result = new Publication
|
||||
qualifier("0043", "Journal", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)
|
||||
case "patent" =>
|
||||
if (source != null && source.orNull != null) {
|
||||
val s = source.get.toLowerCase
|
||||
if (s.contains("patent") || s.contains("brevet")) {
|
||||
result = new Publication
|
||||
qualifier(
|
||||
"0019",
|
||||
"Patent",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
} else if (s.contains("journal of")) {
|
||||
result = new Publication
|
||||
qualifier(
|
||||
"0043",
|
||||
"Journal",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
} else if (
|
||||
s.contains("proceedings") || s.contains("conference") || s.contains("workshop") || s.contains(
|
||||
"symposium"
|
||||
)
|
||||
) {
|
||||
result = new Publication
|
||||
qualifier(
|
||||
"0001",
|
||||
"Article",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
} else null
|
||||
} else null
|
||||
|
||||
case "repository" =>
|
||||
result = new Publication()
|
||||
qualifier(
|
||||
"0038",
|
||||
"Other literature type",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
|
||||
case "thesis" =>
|
||||
result = new Publication
|
||||
qualifier(
|
||||
"0044",
|
||||
"Thesis",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
case "dataset" =>
|
||||
result = new OafDataset
|
||||
qualifier(
|
||||
"0021",
|
||||
"Dataset",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
case "conference" =>
|
||||
result = new Publication
|
||||
qualifier(
|
||||
"0001",
|
||||
"Article",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
}
|
||||
|
||||
if (result != null) {
|
||||
result.setDataInfo(MAGDataInfo)
|
||||
val i = new Instance
|
||||
i.setInstancetype(tp)
|
||||
i.setInstanceTypeMapping(
|
||||
List(instanceTypeMapping(currentType, ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1)).asJava
|
||||
)
|
||||
result.setInstance(List(i).asJava)
|
||||
}
|
||||
result
|
||||
|
||||
}
|
||||
|
||||
def convertMAGtoOAF(paper: MAGPaper): String = {
|
||||
|
||||
// Filter out all the MAG papers that have no URL
|
||||
if (paper.urls.orNull == null)
|
||||
return null
|
||||
|
||||
val result = createResultFromType(paper.docType, paper.originalVenue)
|
||||
if (result == null)
|
||||
return null
|
||||
|
||||
result.setCollectedfrom(List(MAGCollectedFrom).asJava)
|
||||
var pidList = List(
|
||||
structuredProperty(
|
||||
paper.paperId.get.toString,
|
||||
qualifier(
|
||||
PidType.mag_id.toString,
|
||||
PidType.mag_id.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
)
|
||||
)
|
||||
|
||||
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
||||
|
||||
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
|
||||
|
||||
val originalTitles = structuredProperty(paper.paperTitle.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
|
||||
result.setTitle(List(originalTitles).asJava)
|
||||
|
||||
if (paper.date.orNull != null) {
|
||||
result.setDateofacceptance(field(paper.date.get, null))
|
||||
} else {
|
||||
if (paper.year.isDefined && paper.year.get > 1700) {
|
||||
result.setDateofacceptance(field(s"${paper.year.get}-01-01", null))
|
||||
}
|
||||
}
|
||||
|
||||
if (paper.onlineDate.orNull != null) {
|
||||
result.setRelevantdate(
|
||||
List(
|
||||
structuredProperty(
|
||||
paper.onlineDate.get,
|
||||
qualifier(
|
||||
"published-online",
|
||||
"published-online",
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
}
|
||||
|
||||
if (paper.publisher.orNull != null) {
|
||||
result.setPublisher(field(paper.publisher.get, null))
|
||||
}
|
||||
|
||||
if (paper.date.isDefined)
|
||||
result.setDateofacceptance(field(paper.date.get, null))
|
||||
if (paper.onlineDate.orNull != null)
|
||||
result.setRelevantdate(
|
||||
List(
|
||||
structuredProperty(
|
||||
paper.onlineDate.get,
|
||||
qualifier(
|
||||
"published-online",
|
||||
"published-online",
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
ModelConstants.DNET_DATACITE_DATE
|
||||
),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
if (paper.publisher.isDefined)
|
||||
result.setPublisher(field(paper.publisher.get, null))
|
||||
|
||||
if (paper.journalId.isDefined && paper.journalName.isDefined) {
|
||||
val j = new Journal
|
||||
|
||||
j.setName(paper.journalName.get)
|
||||
j.setSp(paper.firstPage.orNull)
|
||||
j.setEp(paper.lastPage.orNull)
|
||||
if (paper.publisher.isDefined)
|
||||
result.setPublisher(field(paper.publisher.get, null))
|
||||
j.setIssnPrinted(paper.journalIssn.orNull)
|
||||
j.setVol(paper.volume.orNull)
|
||||
j.setIss(paper.issue.orNull)
|
||||
j.setConferenceplace(paper.conferenceLocation.orNull)
|
||||
result match {
|
||||
case publication: Publication => publication.setJournal(j)
|
||||
}
|
||||
}
|
||||
|
||||
if (paper.abstractText.isDefined)
|
||||
result.setDescription(List(field(paper.abstractText.get, null)).asJava)
|
||||
if (paper.authors.isDefined && paper.authors.get.nonEmpty) {
|
||||
result.setAuthor(
|
||||
paper.authors.get
|
||||
.filter(a => a.AuthorName.isDefined)
|
||||
.map(a => {
|
||||
val author = new Author
|
||||
author.setFullname(a.AuthorName.get)
|
||||
author
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
}
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
if (paper.doi.orNull != null) {
|
||||
pidList = pidList ::: List(
|
||||
structuredProperty(
|
||||
paper.doi.get,
|
||||
qualifier(
|
||||
PidType.doi.toString,
|
||||
PidType.doi.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
)
|
||||
)
|
||||
}
|
||||
instance.setPid(pidList.asJava)
|
||||
result.setPid(pidList.asJava)
|
||||
instance.setUrl(paper.urls.get.asJava)
|
||||
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
|
||||
instance.setCollectedfrom(MAGCollectedFrom)
|
||||
instance.setAccessright(
|
||||
accessRight(
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES,
|
||||
ModelConstants.DNET_ACCESS_MODES
|
||||
)
|
||||
)
|
||||
|
||||
if (paper.authors.orNull != null && paper.authors.get.nonEmpty)
|
||||
result.setAuthor(
|
||||
paper.authors.get
|
||||
.filter(a => a.AuthorName.orNull != null)
|
||||
.map { a =>
|
||||
val author = new Author
|
||||
author.setFullname(a.AuthorName.get)
|
||||
var authorPid = List(
|
||||
structuredProperty(
|
||||
a.AuthorId.get.toString,
|
||||
qualifier(
|
||||
PidType.mag_id.toString,
|
||||
PidType.mag_id.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
)
|
||||
)
|
||||
if (a.GridId.orNull != null) {
|
||||
authorPid = authorPid ::: List(
|
||||
structuredProperty(
|
||||
a.AuthorId.get.toString,
|
||||
qualifier(
|
||||
PidType.mag_id.toString,
|
||||
PidType.mag_id.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
)
|
||||
)
|
||||
}
|
||||
author.setPid(authorPid.asJava)
|
||||
author
|
||||
}
|
||||
.asJava
|
||||
)
|
||||
mapper.writeValueAsString(result)
|
||||
|
||||
}
|
||||
|
||||
def generateOrganization(r: Row): String = {
|
||||
|
||||
val o = new Organization
|
||||
val affId = s"20|mag_________::${DHPUtils.md5(r.getAs[Long]("AffiliationId").toString)}"
|
||||
o.setId(affId)
|
||||
o.setDataInfo(MAGDataInfo)
|
||||
o.setCollectedfrom(List(MAGCollectedFrom).asJava)
|
||||
o.setLegalname(field(r.getAs[String]("DisplayName"), null))
|
||||
val gid = r.getAs[String]("GridId")
|
||||
if (gid != null) {
|
||||
o.setPid(
|
||||
List(
|
||||
structuredProperty(
|
||||
gid,
|
||||
qualifier(
|
||||
PidType.GRID.toString,
|
||||
PidType.GRID.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
),
|
||||
structuredProperty(
|
||||
r.getAs[Long]("AffiliationId").toString,
|
||||
qualifier(
|
||||
PidType.mag_id.toString,
|
||||
PidType.mag_id.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
} else {
|
||||
o.setPid(
|
||||
List(
|
||||
structuredProperty(
|
||||
r.getAs[Long]("AffiliationId").toString,
|
||||
qualifier(
|
||||
PidType.mag_id.toString,
|
||||
PidType.mag_id.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
null
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
}
|
||||
val c = r.getAs[String]("Iso3166Code")
|
||||
if (c != null)
|
||||
o.setCountry(qualifier(c, c, "dnet:countries", "dnet:countries"))
|
||||
else
|
||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
||||
val ws = r.getAs[String]("OfficialPage")
|
||||
if (ws != null)
|
||||
o.setWebsiteurl(field(ws, null))
|
||||
val a = new AtomicAction[Organization]()
|
||||
a.setClazz(classOf[Organization])
|
||||
a.setPayload(o)
|
||||
mapper.writeValueAsString(a)
|
||||
}
|
||||
|
||||
def generateAffiliationRelations(paperAffiliation: Row): List[Relation] = {
|
||||
|
||||
val affId = s"20|mag_________::${DHPUtils.md5(paperAffiliation.getAs[Long]("AffiliationId").toString)}"
|
||||
val oafId = s"50|mag_________::${DHPUtils.md5(paperAffiliation.getAs[Long]("PaperId").toString)}"
|
||||
val r: Relation = new Relation
|
||||
r.setSource(oafId)
|
||||
r.setTarget(affId)
|
||||
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
|
||||
r.setSubRelType(ModelConstants.AFFILIATION)
|
||||
r.setDataInfo(MAGDataInfo)
|
||||
r.setCollectedfrom(List(MAGCollectedFrom).asJava)
|
||||
val r1: Relation = new Relation
|
||||
r1.setTarget(oafId)
|
||||
r1.setSource(affId)
|
||||
r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
|
||||
r1.setSubRelType(ModelConstants.AFFILIATION)
|
||||
r1.setDataInfo(MAGDataInfo)
|
||||
r1.setCollectedfrom(List(MAGCollectedFrom).asJava)
|
||||
List(r, r1)
|
||||
|
||||
}
|
||||
|
||||
def convertInvertedIndexString(json_input: String): String = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(json_input)
|
||||
val idl = (json \ "IndexLength").extract[Int]
|
||||
if (idl > 0) {
|
||||
val res = Array.ofDim[String](idl)
|
||||
|
||||
val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
|
||||
|
||||
for { (k: String, v: List[Int]) <- iid } {
|
||||
v.foreach(item => res(item) = k)
|
||||
}
|
||||
(0 until idl).foreach(i => {
|
||||
if (res(i) == null)
|
||||
res(i) = ""
|
||||
})
|
||||
return res.mkString(" ")
|
||||
}
|
||||
""
|
||||
}
|
||||
|
||||
}
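convertInvertedIndexString rebuilds an abstract from MAG's inverted-index representation, in which each token is mapped to the positions where it occurs. A small usage sketch, assuming MagUtility is on the classpath; the sample record is made up but follows the IndexLength/InvertedIndex shape the method parses:

object InvertedIndexSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical MAG abstract record: every token maps to its positions in the text.
    val sample =
      """{"IndexLength":4,"InvertedIndex":{"We":[0],"study":[1],"open":[2],"access":[3]}}"""
    // Expected output: "We study open access"
    println(MagUtility.convertInvertedIndexString(sample))
  }
}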
|
|
@ -0,0 +1,199 @@
|
|||
package eu.dnetlib.dhp.collection.mag
|
||||
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types.{StringType, StructField, StructType}
|
||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Every Spark application runs this method,
 * where the whole logic of the Spark node is defined.
 */
|
||||
override def run(): Unit = {
|
||||
val magBasePath: String = parser.get("magBasePath")
|
||||
log.info("found parameters magBasePath: {}", magBasePath)
|
||||
generatedDenormalizedMAGTable(spark, magBasePath)
|
||||
}
|
||||
|
||||
private def generatedDenormalizedMAGTable(
|
||||
spark: SparkSession,
|
||||
magBasePath: String
|
||||
): Unit = {
|
||||
|
||||
import spark.implicits._
|
||||
val schema: StructType = StructType(StructField("DOI", StringType) :: Nil)
|
||||
|
||||
//Filter all the MAG Papers that intersect with a Crossref DOI
|
||||
|
||||
val magPapers = MagUtility
|
||||
.loadMagEntity(spark, "Papers", magBasePath)
|
||||
.withColumn("Doi", lower(col("Doi")))
|
||||
|
||||
magPapers.cache()
|
||||
magPapers.count()
|
||||
//log.info("Create current abstract")
|
||||
|
||||
// The abstract is stored as an inverted index; we apply a function that converts it back
// to a plain string and recreate a table (PaperId, Abstract)
|
||||
val paperAbstract = MagUtility
|
||||
.loadMagEntity(spark, "PaperAbstractsInvertedIndex", magBasePath)
|
||||
.map(s => (s.getLong(0), MagUtility.convertInvertedIndexString(s.getString(1))))
|
||||
.withColumnRenamed("_1", "PaperId")
|
||||
.withColumnRenamed("_2", "Abstract")
|
||||
|
||||
// We define step0 as the result of the join between the MAG papers and the paper abstracts
|
||||
|
||||
val step0 = magPapers
|
||||
.join(paperAbstract, magPapers("PaperId") === paperAbstract("PaperId"), "left")
|
||||
.select(magPapers("*"), paperAbstract("Abstract"))
|
||||
.cache()
|
||||
|
||||
step0.count()
|
||||
|
||||
magPapers.unpersist()
|
||||
|
||||
// We have three tables, Authors, Affiliations and PaperAuthorAffiliations; in the
// next step we create a table containing, for each paper, the list of its authors
// together with their affiliations
|
||||
val authors = MagUtility.loadMagEntity(spark, "Authors", magBasePath)
|
||||
val affiliations = MagUtility.loadMagEntity(spark, "Affiliations", magBasePath)
|
||||
val paperAuthorAffiliations = MagUtility.loadMagEntity(spark, "PaperAuthorAffiliations", magBasePath)
|
||||
|
||||
val j1 = paperAuthorAffiliations
|
||||
.join(authors, paperAuthorAffiliations("AuthorId") === authors("AuthorId"), "inner")
|
||||
.select(
|
||||
col("PaperId"),
|
||||
col("AffiliationId"),
|
||||
col("AuthorSequenceNumber"),
|
||||
authors("DisplayName").alias("AuthorName"),
|
||||
authors("AuthorId")
|
||||
)
|
||||
|
||||
val paperAuthorAffiliationNormalized = j1
|
||||
.join(affiliations, j1("AffiliationId") === affiliations("AffiliationId"), "left")
|
||||
.select(j1("*"), affiliations("DisplayName").alias("AffiliationName"), affiliations("GridId"))
|
||||
.groupBy("PaperId")
|
||||
.agg(
|
||||
collect_list(
|
||||
struct("AffiliationId", "AuthorSequenceNumber", "AffiliationName", "AuthorName", "AuthorId", "GridId")
|
||||
).alias("authors")
|
||||
)
|
||||
val step1 = step0
|
||||
.join(paperAuthorAffiliationNormalized, step0("PaperId") === paperAuthorAffiliationNormalized("PaperId"), "left")
|
||||
.select(step0("*"), paperAuthorAffiliationNormalized("authors"))
|
||||
.cache()
|
||||
step1.count()
|
||||
|
||||
step0.unpersist()
|
||||
|
||||
val conference = MagUtility
|
||||
.loadMagEntity(spark, "ConferenceInstances", magBasePath)
|
||||
.select(
|
||||
$"ConferenceInstanceId",
|
||||
$"DisplayName".as("conferenceName"),
|
||||
$"Location".as("conferenceLocation"),
|
||||
$"StartDate".as("conferenceStartDate"),
|
||||
$"EndDate".as("conferenceEndDate")
|
||||
)
|
||||
|
||||
val step2 = step1
|
||||
.join(conference, step1("ConferenceInstanceId") === conference("ConferenceInstanceId"), "left")
|
||||
.select(
|
||||
step1("*"),
|
||||
conference("conferenceName"),
|
||||
conference("conferenceLocation"),
|
||||
conference("conferenceStartDate"),
|
||||
conference("conferenceEndDate")
|
||||
)
|
||||
.cache()
|
||||
step2.count()
|
||||
step1.unpersist()
|
||||
|
||||
val journals = MagUtility
|
||||
.loadMagEntity(spark, "Journals", magBasePath)
|
||||
.select(
|
||||
$"JournalId",
|
||||
$"DisplayName".as("journalName"),
|
||||
$"Issn".as("journalIssn"),
|
||||
$"Publisher".as("journalPublisher")
|
||||
)
|
||||
val step3 = step2
|
||||
.join(journals, step2("JournalId") === journals("JournalId"), "left")
|
||||
.select(
|
||||
step2("*"),
|
||||
journals("journalName"),
|
||||
journals("journalIssn"),
|
||||
journals("journalPublisher")
|
||||
)
|
||||
.cache
|
||||
step3.count()
|
||||
|
||||
val paper_urls = MagUtility
|
||||
.loadMagEntity(spark, "PaperUrls", magBasePath)
|
||||
.groupBy("PaperId")
|
||||
.agg(slice(collect_set("SourceUrl"), 1, 6).alias("urls"))
|
||||
.cache
|
||||
|
||||
paper_urls.count
|
||||
|
||||
step3
|
||||
.join(paper_urls, step3("PaperId") === paper_urls("PaperId"))
|
||||
.select(step3("*"), paper_urls("urls"))
|
||||
.select(
|
||||
$"PaperId".as("paperId"),
|
||||
$"Doi".as("doi"),
|
||||
$"DocType".as("docType"),
|
||||
$"PaperTitle".as("paperTitle"),
|
||||
$"OriginalTitle".as("originalTitle"),
|
||||
$"BookTitle".as("bookTitle"),
|
||||
$"Year".as("year"),
|
||||
$"Date".as("date"),
|
||||
$"OnlineDate".as("onlineDate"),
|
||||
$"Publisher".as("publisher"),
|
||||
$"JournalId".as("journalId"),
|
||||
$"ConferenceSeriesId".as("conferenceSeriesId"),
|
||||
$"ConferenceInstanceId".as("conferenceInstanceId"),
|
||||
$"Volume".as("volume"),
|
||||
$"Issue".as("issue"),
|
||||
$"FirstPage".as("firstPage"),
|
||||
$"LastPage".as("lastPage"),
|
||||
$"ReferenceCount".as("referenceCount"),
|
||||
$"CitationCount".as("citationCount"),
|
||||
$"EstimatedCitation".as("estimatedCitation"),
|
||||
$"OriginalVenue".as("originalVenue"),
|
||||
$"FamilyId".as("familyId"),
|
||||
$"FamilyRank".as("familyRank"),
|
||||
$"DocSubTypes".as("docSubTypes"),
|
||||
$"CreatedDate".as("createdDate"),
|
||||
$"Abstract".as("abstractText"),
|
||||
$"authors".as("authors"),
|
||||
$"conferenceName".as("conferenceName"),
|
||||
$"conferenceLocation".as("conferenceLocation"),
|
||||
$"conferenceStartDate".as("conferenceStartDate"),
|
||||
$"conferenceEndDate".as("conferenceEndDate"),
|
||||
$"journalName".as("journalName"),
|
||||
$"journalIssn".as("journalIssn"),
|
||||
$"journalPublisher".as("journalPublisher"),
|
||||
$"urls"
|
||||
)
|
||||
.write
|
||||
.mode("OverWrite")
|
||||
.save(s"$magBasePath/mag_denormalized")
|
||||
step3.unpersist()
|
||||
}
|
||||
}
|
||||
|
||||
object SparkCreateMagDenormalizedTable {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(SparkCreateMagDenormalizedTable.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new SparkCreateMagDenormalizedTable(
|
||||
"/eu/dnetlib/dhp/collection/mag/create_MAG_denormalized_table_properties.json",
|
||||
args,
|
||||
log
|
||||
).initialize().run()
|
||||
}
|
||||
}
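Each intermediate result above is cached and counted before the previous step is unpersisted; the count forces the join to be materialized exactly once, so later joins reuse it instead of recomputing the whole lineage. A minimal, self-contained sketch of that pattern using hypothetical toy DataFrames:

import org.apache.spark.sql.{DataFrame, SparkSession}

object CacheStepSketch {

  // Materialize `next` once, then release the previous step from the cache.
  def step(prev: DataFrame, next: DataFrame): DataFrame = {
    val cached = next.cache()
    cached.count()   // forces evaluation so the join is computed exactly once
    prev.unpersist() // the previous intermediate result is no longer needed
    cached
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cache-step-sketch").getOrCreate()
    import spark.implicits._

    val step0 = Seq((1L, "a"), (2L, "b")).toDF("PaperId", "title").cache()
    step0.count()

    val notes = Seq((1L, "x")).toDF("PaperId", "note")
    val step1 = step(step0, step0.join(notes, Seq("PaperId"), "left"))

    step1.show()
    spark.stop()
  }
}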
|
|
@ -0,0 +1,84 @@
|
|||
package eu.dnetlib.dhp.collection.mag
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
||||
import org.apache.spark.sql.functions.col
|
||||
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Every Spark application runs this method,
 * where the whole logic of the Spark node is defined.
 */
|
||||
override def run(): Unit = {
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info(s"outputBasePath is '$outputBasePath'")
|
||||
val mdstorePath = s"$outputBasePath$MDSTORE_DATA_PATH"
|
||||
val magBasePath: String = parser.get("magBasePath")
|
||||
log.info("found parameters magBasePath: {}", magBasePath)
|
||||
convertMAG(spark, magBasePath, mdstorePath)
|
||||
generateAffiliations(spark, magBasePath, mdstorePath)
|
||||
reportTotalSize(mdstorePath, outputBasePath)
|
||||
}
|
||||
|
||||
def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
|
||||
import spark.implicits._
|
||||
|
||||
spark.read
|
||||
.load(s"$magBasePath/mag_denormalized")
|
||||
.as[MAGPaper]
|
||||
.filter(col("doi").isNotNull)
|
||||
.map(s => MagUtility.convertMAGtoOAF(s))
|
||||
.filter(s => s != null)
|
||||
.write
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(mdStorePath)
|
||||
|
||||
}
|
||||
|
||||
def generateAffiliations(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
|
||||
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])
|
||||
val schema = new StructType()
|
||||
.add(StructField("id", StringType))
|
||||
.add(StructField("originalId", ArrayType(StringType)))
|
||||
val generatedMag =
|
||||
spark.read.schema(schema).json(mdStorePath).selectExpr("explode(originalId) as PaperId").distinct()
|
||||
val paperAuthorAffiliations = MagUtility
|
||||
.loadMagEntity(spark, "PaperAuthorAffiliations", magBasePath)
|
||||
.where(col("AffiliationId").isNotNull)
|
||||
.select("PaperId", "AffiliationId")
|
||||
.distinct
|
||||
paperAuthorAffiliations
|
||||
.join(generatedMag, paperAuthorAffiliations("PaperId") === generatedMag("PaperId"), "leftsemi")
|
||||
.flatMap(r => MagUtility.generateAffiliationRelations(r))
|
||||
.write
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.json(mdStorePath)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
object SparkMAGtoOAF {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(SparkMAGtoOAF.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new SparkMAGtoOAF("/eu/dnetlib/dhp/collection/mag/convert_MAG_to_OAF_properties.json", args, log)
|
||||
.initialize()
|
||||
.run()
|
||||
}
|
||||
}
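generateAffiliations uses a "leftsemi" join, which behaves as a filter: it keeps only the left-hand rows that have a match on the right and returns only the left-hand columns. A small self-contained example with made-up data:

import org.apache.spark.sql.SparkSession

object LeftSemiSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("leftsemi-sketch").getOrCreate()
    import spark.implicits._

    val affiliations = Seq((1L, 10L), (2L, 20L), (3L, 30L)).toDF("PaperId", "AffiliationId")
    val generated    = Seq(Tuple1(1L), Tuple1(3L)).toDF("PaperId")

    // "leftsemi" acts as a filter: only affiliation rows whose PaperId also appears
    // in `generated` survive, and only the left-hand columns are returned.
    affiliations
      .join(generated, affiliations("PaperId") === generated("PaperId"), "leftsemi")
      .show()

    spark.stop()
  }
}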
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.dhp.collection.mag
|
||||
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class SparkMagOrganizationAS(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Every Spark application runs this method,
 * where the whole logic of the Spark node is defined.
 */
|
||||
override def run(): Unit = {
|
||||
val magBasePath: String = parser.get("magBasePath")
|
||||
log.info(s"magBasePath is $magBasePath")
|
||||
val outputPath: String = parser.get("outputPath")
|
||||
log.info(s"outputPath is $outputPath")
|
||||
generateAS(spark, magBasePath, outputPath)
|
||||
|
||||
}
|
||||
|
||||
def generateAS(spark: SparkSession, magBasePath: String, outputPath: String): Unit = {
|
||||
import spark.implicits._
|
||||
val organizations = MagUtility.loadMagEntity(spark, "Affiliations", magBasePath)
|
||||
organizations
|
||||
.map(r => MagUtility.generateOrganization(r))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(outputPath)
|
||||
}
|
||||
}
|
||||
|
||||
object SparkMagOrganizationAS {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(SparkMagOrganizationAS.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new SparkMagOrganizationAS("/eu/dnetlib/dhp/collection/mag/create_organization_AS.json", args, log)
|
||||
.initialize()
|
||||
.run()
|
||||
|
||||
}
|
||||
}
|
|
@ -46,20 +46,6 @@ class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], lo
|
|||
reportTotalSize(targetPath, outputBasePath)
|
||||
}
|
||||
|
||||
/** When working with an MDStore we need to store the size of the current dataset
 * in a file on HDFS.
 * @param targetPath     path of the dataset whose records are counted
 * @param outputBasePath base path of the MDStore version where the size file is written
 */
|
||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||
val total_items = spark.read.text(targetPath).count()
|
||||
writeHdfsFile(
|
||||
spark.sparkContext.hadoopConfiguration,
|
||||
s"$total_items",
|
||||
outputBasePath + MDSTORE_SIZE_PATH
|
||||
)
|
||||
}
|
||||
|
||||
/** Generate the transformed and cleaned OAF Dataset from the native one
|
||||
*
|
||||
* @param sourcePath path of the native Dataset in JSON/Datacite format
|
||||
|
|
|
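The reportTotalSize method removed above relies on writeHdfsFile, a project helper whose body is not part of this diff. A rough, hypothetical equivalent using the plain Hadoop FileSystem API (the object name and signature are assumptions, not the project's actual helper):

import java.nio.charset.StandardCharsets
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsWriteSketch {
  // Write a small text payload (here, a record count) to a single HDFS file,
  // overwriting any previous content.
  def writeTextFile(conf: Configuration, content: String, path: String): Unit = {
    val fs = FileSystem.get(conf)
    val out = fs.create(new Path(path), true)
    try out.write(content.getBytes(StandardCharsets.UTF_8))
    finally out.close()
  }
}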
@ -231,7 +231,7 @@ object BioDBToOAF {
|
|||
def uniprotToOAF(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pid = (json \ "pid").extract[String]
|
||||
val pid = (json \ "pid").extract[String].trim()
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff