diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index c2f76cff7..bfec019af 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -70,10 +70,7 @@ com.ibm.icu icu4j - - org.apache.hadoop - hadoop-common - + com.github.sisyphsu dateparser @@ -163,7 +160,7 @@ eu.dnetlib.dhp - ${dhp-schemas.artifact} + dhp-schemas @@ -172,4 +169,23 @@ + + + + spark-34 + + + javax.xml.bind + jaxb-api + 2.2.11 + + + com.sun.xml.ws + jaxws-ri + 2.3.3 + pom + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 997948687..f4ba09f72 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.util.Map; import java.util.Optional; import java.util.Spliterator; import java.util.Spliterators; @@ -9,6 +10,8 @@ import java.util.stream.StreamSupport; import org.apache.commons.lang3.StringUtils; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; @@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin { final String entityXpath = api.getParams().get("entityXpath"); final String authMethod = api.getParams().get("authMethod"); final String authToken = api.getParams().get("authToken"); + final String requestHeaderMap = api.getParams().get("requestHeaderMap"); + Gson gson = new Gson(); + Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class); final String resultSizeValue = Optional .ofNullable(api.getParams().get("resultSizeValue")) .filter(StringUtils::isNotBlank) @@ -64,9 +70,6 @@ public 
class RestCollectorPlugin implements CollectorPlugin { if (StringUtils.isBlank(resultFormatValue)) { throw new CollectorException("Param 'resultFormatValue' is null or empty"); } - if (StringUtils.isBlank(queryParams)) { - throw new CollectorException("Param 'queryParams' is null or empty"); - } if (StringUtils.isBlank(entityXpath)) { throw new CollectorException("Param 'entityXpath' is null or empty"); } @@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin { entityXpath, authMethod, authToken, - resultOutputFormat); + resultOutputFormat, + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 76af6cff1..2518fd92f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -9,6 +9,7 @@ import java.net.URL; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.Iterator; +import java.util.Map; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; @@ -34,6 +35,8 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; +import com.google.common.collect.Maps; + import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; @@ -55,7 +58,7 @@ public class RestIterator implements Iterator { private final HttpClientParams clientParams; - private final String BASIC = "basic"; + private final String AUTHBASIC = "basic"; private final String baseUrl; private final String resumptionType; @@ -89,6 +92,11 @@ public class RestIterator implements Iterator { */ private final String 
resultOutputFormat; + /* + * Can be used to set additional request headers, like for content negotiation + */ + private Map requestHeaders; + /** * RestIterator class compatible to version 1.3.33 */ @@ -107,7 +115,8 @@ public class RestIterator implements Iterator { final String entityXpath, final String authMethod, final String authToken, - final String resultOutputFormat) { + final String resultOutputFormat, + final Map requestHeaders) { this.clientParams = clientParams; this.baseUrl = baseUrl; @@ -119,6 +128,7 @@ public class RestIterator implements Iterator { this.authMethod = authMethod; this.authToken = authToken; this.resultOutputFormat = resultOutputFormat; + this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap(); this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; @@ -231,25 +241,20 @@ public class RestIterator implements Iterator { final URL qUrl = new URL(query); log.debug("authMethod: {}", this.authMethod); - if ("bearer".equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken); - conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else if (this.BASIC.equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken); - conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else { - theHttpInputStream = qUrl.openStream(); + if (
"bearer".equalsIgnoreCase(this.authMethod)) { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: {}", resultXml); + requestHeaders.put("Authorization", "Bearer " + authToken); + // Content-Type is no longer forced for bearer auth; any extra header is taken from requestHeaders + } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: {}", resultXml); + requestHeaders.put("Authorization", "Basic " + authToken); + // Accept is no longer forced for basic auth; any extra header is taken from requestHeaders } + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestMethod("GET"); + this.setRequestHeader(conn); + resultStream = conn.getInputStream(); - this.resultStream = theHttpInputStream; if ("json".equals(this.resultOutputFormat)) { resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8); resultXml = JsonUtils.convertToXML(resultJson); @@ -380,7 +385,8 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) + && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); @@ -433,6 +439,22 @@ public class RestIterator implements Iterator { } } + /** + * setRequestHeader + * + * setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value. 
+ * @param conn the connection on which every entry of requestHeaders is set as a request property + */ + private void setRequestHeader(HttpURLConnection conn) { + if (requestHeaders != null) { + for (String key : requestHeaders.keySet()) { + conn.setRequestProperty(key, requestHeaders.get(key)); + } + log.debug("Set Request Header with: {}", requestHeaders.keySet()); + } + + } + public String getResultFormatValue() { return this.resultFormatValue; } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java index 90f4c7f25..0e64f8bab 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest { private final String resumptionType = "page"; private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; - private final String resultSizeParam = ""; - private final String resultSizeValue = ""; + private final String resultSizeParam = "page[size]"; + private final String resultSizeValue = "100"; private final String resultFormatParam = "format"; private final String resultFormatValue = "json"; @@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest { final AtomicInteger i = new AtomicInteger(0); final Stream stream = this.rcp.collect(this.api, new AggregatorReport()); - stream.limit(200).forEach(s -> { + stream.limit(2000).forEach(s -> { Assertions.assertTrue(s.length() > 0); i.incrementAndGet(); log.info(s); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index f708c367b..99b95d9e3 100644 --- 
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -4,6 +4,11 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; @@ -12,6 +17,8 @@ import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; @@ -25,18 +32,18 @@ class RestCollectorPluginTest { private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); - private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; - private final String resumptionType = "count"; - private final String resumptionParam = "from"; - private final String entityXpath = "//hits/hits"; - private final String resumptionXpath = "//hits"; - private final String resultTotalXpath = "//hits/total"; - private final String resultFormatParam = "format"; + private final String baseUrl = "https://ddh-openapi.worldbank.org/search"; + private final String resumptionType = "discover"; + private final String resumptionParam = "skip"; + private final String entityXpath = "//*[local-name()='data']"; + private final String resumptionXpath = ""; + private final String resultTotalXpath = "//*[local-name()='count']"; + private final String resultFormatParam = ""; private final String resultFormatValue = "json"; - private final String resultSizeParam = "size"; + private final String resultSizeParam = "top"; private final String resultSizeValue 
= "10"; // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; - private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + private final String query = ""; // private String query = "=(sources:engrXiv AND type:preprint)"; private final String protocolDescriptor = "rest_json2xml"; @@ -56,6 +63,7 @@ class RestCollectorPluginTest { params.put("resultSizeValue", resultSizeValue); params.put("queryParams", query); params.put("entityXpath", entityXpath); + params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}"); api.setBaseUrl(baseUrl); api.setParams(params); @@ -78,4 +86,20 @@ class RestCollectorPluginTest { log.info("{}", i.intValue()); Assertions.assertTrue(i.intValue() > 0); } + + @Disabled + @Test + void testUrl() throws IOException { + String url_s = "https://ddh-openapi.worldbank.org/search?&top=10"; + URL url = new URL(url_s); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + conn.setRequestProperty("User-Agent", "OpenAIRE"); + Gson gson = new Gson(); + System.out.println("Response header"); + System.out.println(gson.toJson(conn.getHeaderFields())); + try (InputStream inputStream = conn.getInputStream()) { + Assertions.assertNotNull(inputStream); + } + + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java index e2d6ad3e7..ed31c2b7e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -44,7 +44,7 @@ public class RestIteratorTest { final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, - query, entityXpath, 
authMethod, authToken, resultOffsetParam); + query, entityXpath, authMethod, authToken, resultOffsetParam, null); int i = 20; while (iterator.hasNext() && i > 0) { String result = iterator.next();