forked from D-Net/dnet-hadoop
merging with branch beta
This commit is contained in:
commit
eedf7c3310
|
@ -12,10 +12,6 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
@ -25,8 +21,12 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||
import eu.dnetlib.doiboost.orcid.util.DownloadsReport;
|
||||
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||
import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkDownloadOrcidAuthors {
|
||||
|
@ -72,16 +72,12 @@ public class SparkDownloadOrcidAuthors {
|
|||
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
|
||||
LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
|
||||
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
||||
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
||||
LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
|
||||
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
||||
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
||||
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
|
||||
LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
|
||||
LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors");
|
||||
|
||||
logger.info("Retrieving data from lamda sequence file");
|
||||
String lambdaFilePath = workingPath + lambdaFileName;
|
||||
logger.info("Retrieving data from lamda sequence file: " + lambdaFilePath);
|
||||
JavaPairRDD<Text, Text> lamdaFileRDD = sc
|
||||
.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
|
||||
.sequenceFile(lambdaFilePath, Text.class, Text.class);
|
||||
final long lamdaFileRDDCount = lamdaFileRDD.count();
|
||||
logger.info("Data retrieved: {}", lamdaFileRDDCount);
|
||||
|
||||
|
@ -102,47 +98,44 @@ public class SparkDownloadOrcidAuthors {
|
|||
final DownloadedRecordData downloaded = new DownloadedRecordData();
|
||||
downloaded.setOrcidId(orcidId);
|
||||
downloaded.setLastModifiedDate(lastModifiedDate);
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams);
|
||||
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
|
||||
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
|
||||
httpConnector.setAuthToken(token);
|
||||
String apiUrl = "https://api.orcid.org/v3.0/" + orcidId + "/record";
|
||||
DownloadsReport report = new DownloadsReport();
|
||||
long startReq = System.currentTimeMillis();
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
boolean downloadCompleted = false;
|
||||
String record = "";
|
||||
try {
|
||||
record = httpConnector.getInputSource(apiUrl, report);
|
||||
downloadCompleted = true;
|
||||
} catch (CollectorException ce) {
|
||||
if (!report.isEmpty()) {
|
||||
int errCode = report.keySet().stream().findFirst().get();
|
||||
report.forEach((k, v) -> {
|
||||
logger.error(k + " " + v);
|
||||
});
|
||||
downloaded.setStatusCode(errCode);
|
||||
} else {
|
||||
downloaded.setStatusCode(-4);
|
||||
}
|
||||
errorsAcc.add(1);
|
||||
}
|
||||
long endReq = System.currentTimeMillis();
|
||||
long reqTime = endReq - startReq;
|
||||
if (reqTime < 1000) {
|
||||
Thread.sleep(1000 - reqTime);
|
||||
}
|
||||
int statusCode = response.getStatusLine().getStatusCode();
|
||||
downloaded.setStatusCode(statusCode);
|
||||
if (statusCode != 200) {
|
||||
switch (statusCode) {
|
||||
case 403:
|
||||
errorHTTP403Acc.add(1);
|
||||
break;
|
||||
case 404:
|
||||
errorHTTP404Acc.add(1);
|
||||
break;
|
||||
case 409:
|
||||
errorHTTP409Acc.add(1);
|
||||
break;
|
||||
case 503:
|
||||
errorHTTP503Acc.add(1);
|
||||
break;
|
||||
case 525:
|
||||
errorHTTP525Acc.add(1);
|
||||
break;
|
||||
default:
|
||||
errorHTTPGenericAcc.add(1);
|
||||
}
|
||||
return downloaded.toTuple2();
|
||||
if (downloadCompleted) {
|
||||
downloaded.setStatusCode(200);
|
||||
downloadedRecordsAcc.add(1);
|
||||
downloaded
|
||||
.setCompressedData(
|
||||
ArgumentApplicationParser
|
||||
.compressArgument(record));
|
||||
}
|
||||
downloadedRecordsAcc.add(1);
|
||||
downloaded
|
||||
.setCompressedData(
|
||||
ArgumentApplicationParser
|
||||
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
||||
client.close();
|
||||
return downloaded.toTuple2();
|
||||
};
|
||||
|
||||
|
@ -153,26 +146,17 @@ public class SparkDownloadOrcidAuthors {
|
|||
long authorsModifiedCount = authorsModifiedRDD.count();
|
||||
logger.info("Authors modified count: {}", authorsModifiedCount);
|
||||
|
||||
logger.info("Start downloading ...");
|
||||
|
||||
final JavaPairRDD<Text, Text> pairRDD = authorsModifiedRDD
|
||||
.repartition(100)
|
||||
.map(downloadRecordFn)
|
||||
.mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())));
|
||||
|
||||
saveAsSequenceFile(workingPath, outputPath, sc, pairRDD);
|
||||
|
||||
logger.info("parsedRecordsAcc: {}", parsedRecordsAcc.value());
|
||||
logger.info("modifiedRecordsAcc: {}", modifiedRecordsAcc.value());
|
||||
logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
|
||||
logger.info("errorHTTP403Acc: {}", errorHTTP403Acc.value());
|
||||
logger.info("errorHTTP404Acc: {}", errorHTTP404Acc.value());
|
||||
logger.info("errorHTTP409Acc: {}", errorHTTP409Acc.value());
|
||||
logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value());
|
||||
logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value());
|
||||
logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value());
|
||||
logger.info("errorsAcc: {}", errorsAcc.value());
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void saveAsSequenceFile(String workingPath, String outputPath, JavaSparkContext sc,
|
||||
|
|
|
@ -11,10 +11,6 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
@ -28,8 +24,12 @@ import com.google.gson.JsonElement;
|
|||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||
import eu.dnetlib.doiboost.orcid.util.DownloadsReport;
|
||||
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||
import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
@ -90,12 +90,7 @@ public class SparkDownloadOrcidWorks {
|
|||
.sparkContext()
|
||||
.longAccumulator("error_parsing_xml_found");
|
||||
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
||||
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
||||
LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
|
||||
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
||||
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
||||
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
|
||||
LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
|
||||
LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors");
|
||||
|
||||
JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
|
||||
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
|
||||
|
@ -149,51 +144,44 @@ public class SparkDownloadOrcidWorks {
|
|||
final DownloadedRecordData downloaded = new DownloadedRecordData();
|
||||
downloaded.setOrcidId(orcidId);
|
||||
downloaded.setLastModifiedDate(lastUpdateValue);
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + relativeWorkUrl);
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams);
|
||||
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
|
||||
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
|
||||
httpConnector.setAuthToken(token);
|
||||
String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorkUrl;
|
||||
DownloadsReport report = new DownloadsReport();
|
||||
long startReq = System.currentTimeMillis();
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
boolean downloadCompleted = false;
|
||||
String record = "";
|
||||
try {
|
||||
record = httpConnector.getInputSource(apiUrl, report);
|
||||
downloadCompleted = true;
|
||||
} catch (CollectorException ce) {
|
||||
if (!report.isEmpty()) {
|
||||
int errCode = report.keySet().stream().findFirst().get();
|
||||
report.forEach((k, v) -> {
|
||||
logger.error(k + " " + v);
|
||||
});
|
||||
downloaded.setStatusCode(errCode);
|
||||
} else {
|
||||
downloaded.setStatusCode(-4);
|
||||
}
|
||||
errorsAcc.add(1);
|
||||
}
|
||||
long endReq = System.currentTimeMillis();
|
||||
long reqTime = endReq - startReq;
|
||||
if (reqTime < 1000) {
|
||||
Thread.sleep(1000 - reqTime);
|
||||
}
|
||||
int statusCode = response.getStatusLine().getStatusCode();
|
||||
downloaded.setStatusCode(statusCode);
|
||||
if (statusCode != 200) {
|
||||
switch (statusCode) {
|
||||
case 403:
|
||||
errorHTTP403Acc.add(1);
|
||||
break;
|
||||
case 404:
|
||||
errorHTTP404Acc.add(1);
|
||||
break;
|
||||
case 409:
|
||||
errorHTTP409Acc.add(1);
|
||||
break;
|
||||
case 503:
|
||||
errorHTTP503Acc.add(1);
|
||||
break;
|
||||
case 525:
|
||||
errorHTTP525Acc.add(1);
|
||||
break;
|
||||
default:
|
||||
errorHTTPGenericAcc.add(1);
|
||||
logger
|
||||
.info(
|
||||
"Downloading {} status code: {}", orcidId,
|
||||
response.getStatusLine().getStatusCode());
|
||||
}
|
||||
return downloaded.toTuple2();
|
||||
if (downloadCompleted) {
|
||||
downloaded.setStatusCode(200);
|
||||
downloadedRecordsAcc.add(1);
|
||||
downloaded
|
||||
.setCompressedData(
|
||||
ArgumentApplicationParser
|
||||
.compressArgument(record));
|
||||
}
|
||||
downloadedRecordsAcc.add(1);
|
||||
downloaded
|
||||
.setCompressedData(
|
||||
ArgumentApplicationParser
|
||||
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
||||
client.close();
|
||||
return downloaded.toTuple2();
|
||||
};
|
||||
|
||||
|
@ -214,11 +202,7 @@ public class SparkDownloadOrcidWorks {
|
|||
logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value());
|
||||
logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value());
|
||||
logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
|
||||
logger.info("errorHTTP403Acc: {}", errorHTTP403Acc.value());
|
||||
logger.info("errorHTTP409Acc: {}", errorHTTP409Acc.value());
|
||||
logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value());
|
||||
logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value());
|
||||
logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value());
|
||||
logger.info("errorsAcc: {}", errorsAcc.value());
|
||||
});
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.util;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
|
||||
/**
 * Report of a download, stored as a map from an integer code to a text message.
 *
 * Being a {@link LinkedHashMap}, entries are iterated in the order in which they were first inserted.
 */
public class DownloadsReport extends LinkedHashMap<Integer, String> {
	// No members: the implicit public no-argument constructor is the only one required.
}
|
|
@ -0,0 +1,272 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.util;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.MAPPER;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.*;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
/**
|
||||
* Derived from eu.dnetlib.dhp.common.collection.HttpConnector2 with custom report and Bearer auth
|
||||
*
|
||||
* @author enrico
|
||||
*/
|
||||
public class MultiAttemptsHttpConnector {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MultiAttemptsHttpConnector.class);
|
||||
|
||||
private HttpClientParams clientParams;
|
||||
|
||||
private String responseType = null;
|
||||
|
||||
private static final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||
|
||||
private String authToken = "";
|
||||
private String acceptHeaderValue = "";
|
||||
private String authMethod = "";
|
||||
public final static String BEARER = "BEARER";
|
||||
|
||||
public MultiAttemptsHttpConnector() {
|
||||
this(new HttpClientParams());
|
||||
}
|
||||
|
||||
public MultiAttemptsHttpConnector(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @param report the list of errors
|
||||
* @return the content of the downloaded resource
|
||||
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public String getInputSource(final String requestUrl, DownloadsReport report)
|
||||
throws CollectorException {
|
||||
return attemptDownloadAsString(requestUrl, 1, report);
|
||||
}
|
||||
|
||||
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
|
||||
final DownloadsReport report) throws CollectorException {
|
||||
|
||||
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
|
||||
return IOUtils.toString(s);
|
||||
} catch (IOException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
|
||||
final DownloadsReport report) throws CollectorException, IOException {
|
||||
|
||||
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
|
||||
final String msg = String
|
||||
.format(
|
||||
"Max number of retries (%s/%s) exceeded, failing.",
|
||||
retryNumber, getClientParams().getMaxNumberOfRetry());
|
||||
log.error(msg);
|
||||
throw new CollectorException(msg);
|
||||
}
|
||||
|
||||
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
|
||||
|
||||
InputStream input = null;
|
||||
|
||||
try {
|
||||
if (getClientParams().getRequestDelay() > 0) {
|
||||
backoffAndSleep(getClientParams().getRequestDelay());
|
||||
}
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
|
||||
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
|
||||
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
|
||||
|
||||
if (!getAcceptHeaderValue().isEmpty()) {
|
||||
urlConn.addRequestProperty(HttpHeaders.ACCEPT, getAcceptHeaderValue());
|
||||
}
|
||||
if (!getAuthToken().isEmpty() && getAuthMethod().equals(BEARER)) {
|
||||
urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken()));
|
||||
}
|
||||
|
||||
if (log.isDebugEnabled()) {
|
||||
logHeaderFields(urlConn);
|
||||
}
|
||||
|
||||
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||
if (is2xx(urlConn.getResponseCode())) {
|
||||
input = urlConn.getInputStream();
|
||||
responseType = urlConn.getContentType();
|
||||
return input;
|
||||
}
|
||||
if (is3xx(urlConn.getResponseCode())) {
|
||||
// REDIRECTS
|
||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||
log.info("The requested url has been moved to {}", newUrl);
|
||||
report
|
||||
.put(
|
||||
urlConn.getResponseCode(),
|
||||
String.format("Moved to: %s", newUrl));
|
||||
urlConn.disconnect();
|
||||
if (retryAfter > 0) {
|
||||
backoffAndSleep(retryAfter);
|
||||
}
|
||||
return attemptDownload(newUrl, retryNumber + 1, report);
|
||||
}
|
||||
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
|
||||
switch (urlConn.getResponseCode()) {
|
||||
case HttpURLConnection.HTTP_NOT_FOUND:
|
||||
case HttpURLConnection.HTTP_BAD_GATEWAY:
|
||||
case HttpURLConnection.HTTP_UNAVAILABLE:
|
||||
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
|
||||
if (retryAfter > 0) {
|
||||
log
|
||||
.warn(
|
||||
"{} - waiting and repeating request after suggested retry-after {} sec.",
|
||||
requestUrl, retryAfter);
|
||||
backoffAndSleep(retryAfter * 1000);
|
||||
} else {
|
||||
log
|
||||
.warn(
|
||||
"{} - waiting and repeating request after default delay of {} sec.",
|
||||
requestUrl, getClientParams().getRetryDelay());
|
||||
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
|
||||
}
|
||||
report.put(urlConn.getResponseCode(), requestUrl);
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, report);
|
||||
default:
|
||||
report
|
||||
.put(
|
||||
urlConn.getResponseCode(),
|
||||
String
|
||||
.format(
|
||||
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
|
||||
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
|
||||
}
|
||||
}
|
||||
throw new CollectorException(
|
||||
String
|
||||
.format(
|
||||
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||
MAPPER.writeValueAsString(report)));
|
||||
} catch (MalformedURLException | UnknownHostException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(-2, e.getMessage());
|
||||
throw new CollectorException(e.getMessage(), e);
|
||||
} catch (SocketTimeoutException | SocketException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(-3, e.getMessage());
|
||||
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||
return attemptDownload(requestUrl, retryNumber + 1, report);
|
||||
}
|
||||
}
|
||||
|
||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||
log.debug("StatusCode: {}", urlConn.getResponseMessage());
|
||||
|
||||
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||
if (e.getKey() != null) {
|
||||
for (String v : e.getValue()) {
|
||||
log.debug(" key: {} - value: {}", e.getKey(), v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
|
||||
log.info("I'm going to sleep for {}ms", sleepTimeMs);
|
||||
try {
|
||||
Thread.sleep(sleepTimeMs);
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (!headerMap.get(key).isEmpty())
|
||||
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
||||
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
|
||||
return headerMap.get(key).get(0);
|
||||
}
|
||||
}
|
||||
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
|
||||
}
|
||||
|
||||
private boolean is2xx(final int statusCode) {
|
||||
return statusCode >= 200 && statusCode <= 299;
|
||||
}
|
||||
|
||||
private boolean is4xx(final int statusCode) {
|
||||
return statusCode >= 400 && statusCode <= 499;
|
||||
}
|
||||
|
||||
private boolean is3xx(final int statusCode) {
|
||||
return statusCode >= 300 && statusCode <= 399;
|
||||
}
|
||||
|
||||
private boolean is5xx(final int statusCode) {
|
||||
return statusCode >= 500 && statusCode <= 599;
|
||||
}
|
||||
|
||||
public String getResponseType() {
|
||||
return responseType;
|
||||
}
|
||||
|
||||
public HttpClientParams getClientParams() {
|
||||
return clientParams;
|
||||
}
|
||||
|
||||
public void setClientParams(HttpClientParams clientParams) {
|
||||
this.clientParams = clientParams;
|
||||
}
|
||||
|
||||
public void setAuthToken(String authToken) {
|
||||
this.authToken = authToken;
|
||||
}
|
||||
|
||||
private String getAuthToken() {
|
||||
return authToken;
|
||||
}
|
||||
|
||||
public String getAcceptHeaderValue() {
|
||||
return acceptHeaderValue;
|
||||
}
|
||||
|
||||
public void setAcceptHeaderValue(String acceptHeaderValue) {
|
||||
this.acceptHeaderValue = acceptHeaderValue;
|
||||
}
|
||||
|
||||
public String getAuthMethod() {
|
||||
return authMethod;
|
||||
}
|
||||
|
||||
public void setAuthMethod(String authMethod) {
|
||||
this.authMethod = authMethod;
|
||||
}
|
||||
}
|
|
@ -1,13 +1,11 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
@ -17,7 +15,6 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
|||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.apache.commons.compress.utils.Lists;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
|
@ -28,8 +25,11 @@ import org.junit.jupiter.api.Disabled;
|
|||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParserTest;
|
||||
import eu.dnetlib.doiboost.orcid.util.DownloadsReport;
|
||||
import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class OrcidClientTest {
|
||||
|
@ -49,7 +49,7 @@ public class OrcidClientTest {
|
|||
|
||||
@BeforeAll
|
||||
private static void setUp() throws IOException {
|
||||
testPath = Files.createTempDirectory(XMLRecordParserTest.class.getName());
|
||||
testPath = Files.createTempDirectory(OrcidClientTest.class.getName());
|
||||
System.out.println("using test path: " + testPath);
|
||||
}
|
||||
|
||||
|
@ -349,4 +349,84 @@ public class OrcidClientTest {
|
|||
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||
logToFile(testPath, "\n\nwork updated \n\n" + work);
|
||||
}
|
||||
|
||||
@Test
|
||||
void downloadUnknownHostExceptionTest() throws Exception {
|
||||
logToFile(testPath, "downloadUnknownHostExceptionTest");
|
||||
final String orcid = "0000-0001-7291-3210";
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
clientParams.setMaxNumberOfRetry(2);
|
||||
MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams);
|
||||
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
|
||||
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
|
||||
httpConnector.setAuthToken("78fdb232-7105-4086-8570-e153f4198e3d");
|
||||
String wrongApiUrl = "https://api.orcid_UNKNOWN.org/v3.0/" + orcid + "/" + REQUEST_TYPE_RECORD;
|
||||
String url = "UNKNOWN";
|
||||
DownloadsReport report = new DownloadsReport();
|
||||
try {
|
||||
httpConnector.getInputSource(wrongApiUrl, report);
|
||||
} catch (CollectorException ce) {
|
||||
logToFile(testPath, "CollectorException downloading: " + ce.getMessage());
|
||||
} catch (Throwable t) {
|
||||
logToFile(testPath, "Throwable downloading: " + t.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void downloadAttemptSuccessTest() throws Exception {
|
||||
logToFile(testPath, "downloadAttemptSuccessTest");
|
||||
final String orcid = "0000-0001-7291-3210";
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
clientParams.setMaxNumberOfRetry(2);
|
||||
MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams);
|
||||
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
|
||||
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
|
||||
httpConnector.setAuthToken("78fdb232-7105-4086-8570-e153f4198e3d");
|
||||
String apiUrl = "https://api.orcid.org/v3.0/" + orcid + "/" + REQUEST_TYPE_RECORD;
|
||||
String url = "UNKNOWN";
|
||||
DownloadsReport report = new DownloadsReport();
|
||||
String record = httpConnector.getInputSource(apiUrl, report);
|
||||
logToFile(testPath, "Downloaded at first attempt record: " + record);
|
||||
}
|
||||
|
||||
@Test
|
||||
void downloadAttemptNotFoundTest() throws Exception {
|
||||
logToFile(testPath, "downloadAttemptNotFoundTest");
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
clientParams.setMaxNumberOfRetry(2);
|
||||
MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams);
|
||||
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
|
||||
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
|
||||
httpConnector.setAuthToken("78fdb232-7105-4086-8570-e153f4198e3d");
|
||||
String apiUrl = "https://api.orcid.org/v3.0/NOTFOUND/" + REQUEST_TYPE_RECORD;
|
||||
DownloadsReport report = new DownloadsReport();
|
||||
try {
|
||||
httpConnector.getInputSource(apiUrl, report);
|
||||
} catch (CollectorException ce) {
|
||||
|
||||
}
|
||||
report.forEach((k, v) -> {
|
||||
try {
|
||||
logToFile(testPath, k + " " + v);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
void testDownloadedAuthor() throws Exception {
|
||||
final String base64CompressedWork = "H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA=";
|
||||
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||
logToFile(testPath, "\n\ndownloaded author \n\n" + work);
|
||||
}
|
||||
|
||||
// Manual check: decompresses a stored payload and appends the decoded text to the test log file,
// labelled as a downloaded work. The literal is presumably base64-encoded gzip data (it starts with
// "H4sI") — TODO confirm against ArgumentApplicationParser.decompressValue.
@Test
|
||||
@Ignore
|
||||
void testDownloadedWork() throws Exception {
|
||||
final String base64CompressedWork = "H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMzuAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BSnn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2P
mudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr896JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA==";
|
||||
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||
logToFile(testPath, "\n\ndownloaded work \n\n" + work);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
|
||||
|
||||
|
@ -24,17 +25,31 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
CleaningRuleMap mapping = new CleaningRuleMap();
|
||||
mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o));
|
||||
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
|
||||
mapping.put(Country.class, o -> {
|
||||
final Country c = (Country) o;
|
||||
if (StringUtils.isBlank(c.getSchemeid())) {
|
||||
c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
|
||||
c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
|
||||
}
|
||||
cleanQualifier(vocabularies, c);
|
||||
});
|
||||
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
|
||||
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
|
||||
return mapping;
|
||||
}
|
||||
|
||||
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
|
||||
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
|
||||
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType());
|
||||
r.setSubRelType(newValue.getClassid());
|
||||
}
|
||||
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_RELCLASS)) {
|
||||
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_RELCLASS, r.getRelClass());
|
||||
r.setRelClass(newValue.getClassid());
|
||||
}
|
||||
}
|
||||
|
||||
private static void cleanCountry(VocabularyGroup vocabularies, Country o) {
|
||||
final Country c = o;
|
||||
if (StringUtils.isBlank(c.getSchemeid())) {
|
||||
c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
|
||||
c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
|
||||
}
|
||||
cleanQualifier(vocabularies, c);
|
||||
}
|
||||
|
||||
private static <Q extends Qualifier> void cleanQualifier(VocabularyGroup vocabularies, Q q) {
|
||||
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
||||
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
|
||||
|
|
|
@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
@ -16,12 +17,12 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
|||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -29,7 +30,8 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
|||
@ExtendWith(MockitoExtension.class)
|
||||
public class GraphCleaningFunctionsTest {
|
||||
|
||||
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
public static final ObjectMapper MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Mock
|
||||
private ISLookUpService isLookUpService;
|
||||
|
@ -49,6 +51,23 @@ public class GraphCleaningFunctionsTest {
|
|||
mapping = CleaningRuleMap.create(vocabularies);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCleanRelations() throws Exception {
|
||||
|
||||
List<String> lines = IOUtils
|
||||
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json"));
|
||||
for (String json : lines) {
|
||||
Relation r_in = MAPPER.readValue(json, Relation.class);
|
||||
assertNotNull(r_in);
|
||||
|
||||
assertFalse(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_in.getRelClass()));
|
||||
|
||||
Relation r_out = OafCleaner.apply(r_in, mapping);
|
||||
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass()));
|
||||
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType()));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCleaning() throws Exception {
|
||||
|
||||
|
@ -87,7 +106,7 @@ public class GraphCleaningFunctionsTest {
|
|||
p_out
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> p.getQualifier())
|
||||
.map(StructuredProperty::getQualifier)
|
||||
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||
|
||||
List<Instance> poi = p_out.getInstance();
|
||||
|
@ -101,8 +120,8 @@ public class GraphCleaningFunctionsTest {
|
|||
assertEquals(2, poii.getPid().size());
|
||||
|
||||
assertTrue(
|
||||
poii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent());
|
||||
assertTrue(poii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent());
|
||||
poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||
assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
||||
|
||||
assertNotNull(poii.getAlternateIdentifier());
|
||||
assertEquals(2, poii.getAlternateIdentifier().size());
|
||||
|
@ -111,16 +130,12 @@ public class GraphCleaningFunctionsTest {
|
|||
poii
|
||||
.getAlternateIdentifier()
|
||||
.stream()
|
||||
.filter(s -> s.getValue().equals("10.1007/s109090161569x"))
|
||||
.findFirst()
|
||||
.isPresent());
|
||||
.anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||
assertTrue(
|
||||
poii
|
||||
.getAlternateIdentifier()
|
||||
.stream()
|
||||
.filter(s -> s.getValue().equals("10.1009/qwerty"))
|
||||
.findFirst()
|
||||
.isPresent());
|
||||
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||
|
||||
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
||||
|
||||
|
@ -142,8 +157,8 @@ public class GraphCleaningFunctionsTest {
|
|||
assertEquals(2, pcii.getPid().size());
|
||||
|
||||
assertTrue(
|
||||
pcii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent());
|
||||
assertTrue(pcii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent());
|
||||
pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
||||
assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
||||
|
||||
assertNotNull(pcii.getAlternateIdentifier());
|
||||
assertEquals(1, pcii.getAlternateIdentifier().size());
|
||||
|
@ -151,9 +166,7 @@ public class GraphCleaningFunctionsTest {
|
|||
pcii
|
||||
.getAlternateIdentifier()
|
||||
.stream()
|
||||
.filter(s -> s.getValue().equals("10.1009/qwerty"))
|
||||
.findFirst()
|
||||
.isPresent());
|
||||
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||
|
||||
getAuthorPids(p_cleaned).forEach(pid -> {
|
||||
System.out
|
||||
|
@ -172,17 +185,17 @@ public class GraphCleaningFunctionsTest {
|
|||
return pub
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(a -> a.getPid())
|
||||
.flatMap(p -> p.stream())
|
||||
.map(s -> s.getQualifier());
|
||||
.map(Author::getPid)
|
||||
.flatMap(Collection::stream)
|
||||
.map(StructuredProperty::getQualifier);
|
||||
}
|
||||
|
||||
private Stream<StructuredProperty> getAuthorPids(Result pub) {
|
||||
return pub
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(a -> a.getPid())
|
||||
.flatMap(p -> p.stream());
|
||||
.map(Author::getPid)
|
||||
.flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
private List<String> vocs() throws IOException {
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
{"relType":"resultResult","subRelType":"citation","relClass":"cites","source":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","target":"50|openaire____::007a4870b31056f89b768cf508e1538e"}
|
||||
{"relType":"resultResult","subRelType":"citation","relClass":"isCitedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"supplement","relClass":"isSupplementTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"supplement","relClass":"isSupplementedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"part","relClass":"isPartOf","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"part","relClass":"hasPart","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"review","relClass":"isReviewedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"review","relClass":"reviews","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"relationship","relClass":"isRelatedTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
||||
{"relType":"resultResult","subRelType":"publicationDataset","relClass":"isRelatedTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
|
|
@ -1231,4 +1231,14 @@ dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-学術雑誌論文(査
|
|||
dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-紀要論文(査読有り)
|
||||
dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-雑誌記事(査読有り)
|
||||
dnet:review_levels @=@ 0001 @=@ 原著論文(査読有り)
|
||||
dnet:review_levels @=@ 0001 @=@ 査読論文
|
||||
dnet:review_levels @=@ 0001 @=@ 査読論文
|
||||
dnet:relation_relClass @=@ Cites @=@ cites
|
||||
dnet:relation_relClass @=@ IsCitedBy @=@ isCitedBy
|
||||
dnet:relation_relClass @=@ HasPart @=@ hasPart
|
||||
dnet:relation_relClass @=@ IsPartOf @=@ isPartOf
|
||||
dnet:relation_relClass @=@ IsReviewedBy @=@ isReviewedBy
|
||||
dnet:relation_relClass @=@ Reviews @=@ reviews
|
||||
dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo
|
||||
dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
|
||||
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
|
||||
dnet:relation_subRelType @=@ relationship @=@ publicationDataset
|
|
@ -1079,4 +1079,41 @@ dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED
|
|||
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/AUTHOR/ORCID @=@ An Open Researcher and Contributor ID (ORCID) that can be associated to an author of your publications
|
||||
dnet:review_levels @=@ dnet:review_levels @=@ 0000 @=@ Unknown
|
||||
dnet:review_levels @=@ dnet:review_levels @=@ 0002 @=@ nonPeerReviewed
|
||||
dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed
|
||||
dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Cites @=@ Cites
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsCitedBy @=@ IsCitedBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasPart @=@ HasPart
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsPartOf @=@ IsPartOf
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsReviewedBy @=@ IsReviewedBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Reviews @=@ Reviews
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSupplementTo @=@ IsSupplementTo
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSupplementedBy @=@ IsSupplementedBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsRelatedTo @=@ IsRelatedTo
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Compiles @=@ Compiles
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Continues @=@ Continues
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Documents @=@ Documents
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasAmongTopNSimilarDocuments @=@ HasAmongTopNSimilarDocuments
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasVersion @=@ HasVersion
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsAmongTopNSimilarDocuments @=@ IsAmongTopNSimilarDocuments
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsCompiledBy @=@ IsCompiledBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsContinuedBy @=@ IsContinuedBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsDerivedFrom @=@ IsDerivedFrom
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsDocumentedBy @=@ IsDocumentedBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsNewVersionOf @=@ IsNewVersionOf
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsObsoletedBy @=@ IsObsoletedBy
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsOriginalFormOf @=@ IsOriginalFormOf
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsPreviousVersionOf @=@ IsPreviousVersionOf
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSourceOf @=@ IsSourceOf
|
||||
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsVariantFormOf @=@ IsVariantFormOf
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ affiliation @=@ affiliation
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ citation @=@ citation
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ dedup @=@ dedup
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ outcome @=@ outcome
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ part @=@ part
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ participation @=@ participation
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ provision @=@ provision
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relationship
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
|
||||
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
|
2
pom.xml
2
pom.xml
|
@ -753,7 +753,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[2.7.17]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[2.7.18]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue