diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index c2f76cff7..bfec019af 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -70,10 +70,7 @@
com.ibm.icu
icu4j
-
- org.apache.hadoop
- hadoop-common
-
+
com.github.sisyphsu
dateparser
@@ -163,7 +160,7 @@
eu.dnetlib.dhp
- ${dhp-schemas.artifact}
+ dhp-schemas
@@ -172,4 +169,23 @@
+
+
+
+ spark-34
+
+
+ javax.xml.bind
+ jaxb-api
+ 2.2.11
+
+
+ com.sun.xml.ws
+ jaxws-ri
+ 2.3.3
+ pom
+
+
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
index 997948687..f4ba09f72 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
@@ -1,6 +1,7 @@
package eu.dnetlib.dhp.collection.plugin.rest;
+import java.util.Map;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
@@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
+import com.google.gson.Gson;
+
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken");
+ final String requestHeaderMap = api.getParams().get("requestHeaderMap");
+ Gson gson = new Gson();
+ Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank)
@@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty");
}
- if (StringUtils.isBlank(queryParams)) {
- throw new CollectorException("Param 'queryParams' is null or empty");
- }
if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty");
}
@@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
entityXpath,
authMethod,
authToken,
- resultOutputFormat);
+ resultOutputFormat,
+ requestHeaders);
return StreamSupport
.stream(
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
index 76af6cff1..2518fd92f 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@@ -9,6 +9,7 @@ import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
+import java.util.Map;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
@@ -34,6 +35,8 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
+import com.google.common.collect.Maps;
+
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
@@ -55,7 +58,7 @@ public class RestIterator implements Iterator {
private final HttpClientParams clientParams;
- private final String BASIC = "basic";
+ private final String AUTHBASIC = "basic";
private final String baseUrl;
private final String resumptionType;
@@ -89,6 +92,11 @@ public class RestIterator implements Iterator {
*/
private final String resultOutputFormat;
+ /*
+ * Can be used to set additional request headers, like for content negotiation
+ */
+ private Map requestHeaders;
+
/**
* RestIterator class compatible to version 1.3.33
*/
@@ -107,7 +115,8 @@ public class RestIterator implements Iterator {
final String entityXpath,
final String authMethod,
final String authToken,
- final String resultOutputFormat) {
+ final String resultOutputFormat,
+ final Map requestHeaders) {
this.clientParams = clientParams;
this.baseUrl = baseUrl;
@@ -119,6 +128,7 @@ public class RestIterator implements Iterator {
this.authMethod = authMethod;
this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat;
+ this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
@@ -231,25 +241,20 @@ public class RestIterator implements Iterator {
final URL qUrl = new URL(query);
log.debug("authMethod: {}", this.authMethod);
- if ("bearer".equalsIgnoreCase(this.authMethod)) {
- log.trace("authMethod before inputStream: {}", resultXml);
- final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
- conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
- conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
- conn.setRequestMethod("GET");
- theHttpInputStream = conn.getInputStream();
- } else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
- log.trace("authMethod before inputStream: {}", resultXml);
- final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
- conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
- conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
- conn.setRequestMethod("GET");
- theHttpInputStream = conn.getInputStream();
- } else {
- theHttpInputStream = qUrl.openStream();
+ if ("bearer".equalsIgnoreCase(this.authMethod)) {
+     log.trace("RestIterator.downloadPage():: authMethod before inputStream: {}", resultXml);
+     requestHeaders.put("Authorization", "Bearer " + authToken);
+     // equalsIgnoreCase, not ==: == compares references, so a configured "bearer" would not match.
+ } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+     log.trace("RestIterator.downloadPage():: authMethod before inputStream: {}", resultXml);
+     requestHeaders.put("Authorization", "Basic " + authToken);
+     // Content-Type/Accept are no longer forced here; supply them through the requestHeaders map if needed.
+ }
+ HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+ conn.setRequestMethod("GET");
+ this.setRequestHeader(conn);
+ resultStream = conn.getInputStream();
- this.resultStream = theHttpInputStream;
if ("json".equals(this.resultOutputFormat)) {
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson);
@@ -380,7 +385,8 @@ public class RestIterator implements Iterator {
try {
if (this.resultTotal == -1) {
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
- if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
+ if ("page".equalsIgnoreCase(this.resumptionType)
+ && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
this.resultTotal += 1;
} // to correct the upper bound
log.info("resultTotal was -1 is now: " + this.resultTotal);
@@ -433,6 +439,22 @@ public class RestIterator implements Iterator {
}
}
+ /**
+  * Applies the configured {@code requestHeaders} to the given connection.
+  *
+  * Each entry is passed to {@code setRequestProperty}, which overwrites an existing value for the same key.
+  * Only the header names are logged, because the values may hold credentials such as the Authorization token.
+  *
+  * @param conn the connection to decorate; headers can only be set before it is connected
+  */
+ private void setRequestHeader(HttpURLConnection conn) {
+     if (requestHeaders == null) {
+         return;
+     }
+     requestHeaders.forEach(conn::setRequestProperty);
+     log.debug("Set Request Header with: {}", requestHeaders.keySet());
+ }
+
public String getResultFormatValue() {
return this.resultFormatValue;
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
index 90f4c7f25..0e64f8bab 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
@@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest {
private final String resumptionType = "page";
private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
- private final String resultSizeParam = "";
- private final String resultSizeValue = "";
+ private final String resultSizeParam = "page[size]";
+ private final String resultSizeValue = "100";
private final String resultFormatParam = "format";
private final String resultFormatValue = "json";
@@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest {
final AtomicInteger i = new AtomicInteger(0);
final Stream stream = this.rcp.collect(this.api, new AggregatorReport());
- stream.limit(200).forEach(s -> {
+ stream.limit(2000).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
i.incrementAndGet();
log.info(s);
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
index f708c367b..99b95d9e3 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
@@ -4,6 +4,11 @@
package eu.dnetlib.dhp.collection.plugin.rest;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
@@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.gson.Gson;
+
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
@@ -25,18 +32,18 @@ class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
- private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
- private final String resumptionType = "count";
- private final String resumptionParam = "from";
- private final String entityXpath = "//hits/hits";
- private final String resumptionXpath = "//hits";
- private final String resultTotalXpath = "//hits/total";
- private final String resultFormatParam = "format";
+ private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
+ private final String resumptionType = "discover";
+ private final String resumptionParam = "skip";
+ private final String entityXpath = "//*[local-name()='data']";
+ private final String resumptionXpath = "";
+ private final String resultTotalXpath = "//*[local-name()='count']";
+ private final String resultFormatParam = "";
private final String resultFormatValue = "json";
- private final String resultSizeParam = "size";
+ private final String resultSizeParam = "top";
private final String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
- private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+ private final String query = "";
// private String query = "=(sources:engrXiv AND type:preprint)";
private final String protocolDescriptor = "rest_json2xml";
@@ -56,6 +63,7 @@ class RestCollectorPluginTest {
params.put("resultSizeValue", resultSizeValue);
params.put("queryParams", query);
params.put("entityXpath", entityXpath);
+ params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
api.setBaseUrl(baseUrl);
api.setParams(params);
@@ -78,4 +86,19 @@ class RestCollectorPluginTest {
log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0);
}
+
+ @Disabled
+ @Test
+ void testUrl() throws IOException {
+     // Manual smoke test (disabled by default): dumps the headers the endpoint answers with.
+     final URL url = new URL("https://ddh-openapi.worldbank.org/search?&top=10");
+     final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+     conn.setRequestMethod("GET");
+     conn.setRequestProperty("User-Agent", "OpenAIRE");
+     System.out.println("Response header"); // getHeaderFields() exposes response headers, not request ones
+     System.out.println(new Gson().toJson(conn.getHeaderFields()));
+     try (InputStream inputStream = conn.getInputStream()) { // close the stream instead of leaking it
+         Assertions.assertNotNull(inputStream);
+     }
+ }
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
index e2d6ad3e7..ed31c2b7e 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
@@ -44,7 +44,7 @@ public class RestIteratorTest {
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
- query, entityXpath, authMethod, authToken, resultOffsetParam);
+ query, entityXpath, authMethod, authToken, resultOffsetParam, null);
int i = 20;
while (iterator.hasNext() && i > 0) {
String result = iterator.next();