From 39810c6e7e22f955d5258f5924ae17c16160e3e6 Mon Sep 17 00:00:00 2001 From: Alessia Date: Wed, 29 May 2024 15:41:36 +0200 Subject: [PATCH 1/3] Rest collector plugin on hadoop supports a new param to pass request headers --- .../plugin/rest/RestCollectorPlugin.java | 11 ++-- .../collection/plugin/rest/RestIterator.java | 62 +++++++++++++------ .../plugin/rest/RestCollectorPluginTest.java | 42 ++++++++++--- .../plugin/rest/RestIteratorTest.java | 2 +- 4 files changed, 83 insertions(+), 34 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 997948687..8445e49e0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -1,12 +1,14 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.util.Map; import java.util.Optional; import java.util.Spliterator; import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.gson.Gson; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.collection.ApiDescriptor; @@ -47,6 +49,9 @@ public class RestCollectorPlugin implements CollectorPlugin { final String entityXpath = api.getParams().get("entityXpath"); final String authMethod = api.getParams().get("authMethod"); final String authToken = api.getParams().get("authToken"); + final String requestHeaderMap = api.getParams().get("requestHeaderMap"); + Gson gson = new Gson(); + Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class); final String resultSizeValue = Optional .ofNullable(api.getParams().get("resultSizeValue")) .filter(StringUtils::isNotBlank) @@ -64,9 +69,6 @@ public class RestCollectorPlugin implements CollectorPlugin { if (StringUtils.isBlank(resultFormatValue)) { throw new CollectorException("Param 'resultFormatValue' is null or empty"); } - if (StringUtils.isBlank(queryParams)) { - throw new CollectorException("Param 'queryParams' is null or empty"); - } if (StringUtils.isBlank(entityXpath)) { throw new CollectorException("Param 'entityXpath' is null or empty"); } @@ -92,7 +94,8 @@ public class RestCollectorPlugin implements CollectorPlugin { entityXpath, authMethod, authToken, - resultOutputFormat); + resultOutputFormat, + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 76af6cff1..e51c9eb1b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -9,6 +9,7 @@ import java.net.URL; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.Iterator; +import java.util.Map; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; @@ -24,6 +25,7 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; +import com.google.common.collect.Maps; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; @@ -49,13 +51,14 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; */ public class RestIterator implements Iterator { + private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; private final HttpClientParams clientParams; - private final String BASIC = "basic"; + private final String AUTHBASIC = "basic"; private final String baseUrl; private final String resumptionType; @@ -89,6 +92,12 @@ public class RestIterator implements Iterator { */ private final String resultOutputFormat; + /* + Can be used to set additional request headers, like for content negotiation + */ + private Map requestHeaders; + + /** * RestIterator class compatible to version 1.3.33 */ @@ -107,7 +116,8 @@ public class RestIterator implements Iterator { final String entityXpath, final String authMethod, final String authToken, - final String resultOutputFormat) { + final String resultOutputFormat, + final Map requestHeaders) { this.clientParams = clientParams; this.baseUrl = baseUrl; @@ -119,6 +129,7 @@ public class RestIterator implements Iterator { this.authMethod = authMethod; this.authToken = authToken; this.resultOutputFormat = resultOutputFormat; + this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap(); this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; @@ -231,25 +242,20 @@ public class RestIterator implements Iterator { final URL qUrl = new URL(query); log.debug("authMethod: {}", this.authMethod); - if ("bearer".equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken); - conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else if (this.BASIC.equalsIgnoreCase(this.authMethod)) { - log.trace("authMethod before inputStream: {}", resultXml); - final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); - conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken); - conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); - conn.setRequestMethod("GET"); - theHttpInputStream = conn.getInputStream(); - } else { - theHttpInputStream = qUrl.openStream(); + if (this.authMethod == "bearer") { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); + requestHeaders.put("Authorization", "Bearer " + authToken); + //requestHeaders.put("Content-Type", "application/json"); + } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); + requestHeaders.put("Authorization", "Basic " + authToken); + //requestHeaders.put("accept", "application/xml"); } + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestMethod("GET"); + this.setRequestHeader(conn); + resultStream = conn.getInputStream(); - this.resultStream = theHttpInputStream; if ("json".equals(this.resultOutputFormat)) { resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8); resultXml = JsonUtils.convertToXML(resultJson); @@ -380,7 +386,7 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); @@ -433,6 +439,22 @@ public class RestIterator implements Iterator { } } + /** + * setRequestHeader + * + * setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value. + * @param conn + */ + private void setRequestHeader(HttpURLConnection conn) { + if (requestHeaders != null) { + for (String key : requestHeaders.keySet()) { + conn.setRequestProperty(key, requestHeaders.get(key)); + } + log.debug("Set Request Header with: " + requestHeaders); + } + + } + public String getResultFormatValue() { return this.resultFormatValue; } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index f708c367b..a9fc325c3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -4,10 +4,16 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; +import com.google.gson.Gson; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,18 +31,18 @@ class RestCollectorPluginTest { private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); - private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; - private final String resumptionType = "count"; - private final String resumptionParam = "from"; - private final String entityXpath = "//hits/hits"; - private final String resumptionXpath = "//hits"; - private final String resultTotalXpath = "//hits/total"; - private final String resultFormatParam = "format"; + private final String baseUrl = "https://ddh-openapi.worldbank.org/search"; + private final String resumptionType = "discover"; + private final String resumptionParam = "skip"; + private final String entityXpath = "//*[local-name()='data']"; + private final String resumptionXpath = ""; + private final String resultTotalXpath = "//*[local-name()='count']"; + private final String resultFormatParam = ""; private final String resultFormatValue = "json"; - private final String resultSizeParam = "size"; + private final String resultSizeParam = "top"; private final String resultSizeValue = "10"; // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; - private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + private final String query = ""; // private String query = "=(sources:engrXiv AND type:preprint)"; private final String protocolDescriptor = "rest_json2xml"; @@ -56,10 +62,12 @@ class RestCollectorPluginTest { params.put("resultSizeValue", resultSizeValue); params.put("queryParams", query); params.put("entityXpath", entityXpath); + params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}"); api.setBaseUrl(baseUrl); api.setParams(params); + rcp = new RestCollectorPlugin(new HttpClientParams()); } @@ -78,4 +86,20 @@ class RestCollectorPluginTest { log.info("{}", i.intValue()); Assertions.assertTrue(i.intValue() > 0); } + + @Disabled + @Test + void testUrl() throws IOException { + String url_s = "https://ddh-openapi.worldbank.org/search?&top=10"; + URL url = new URL(url_s); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + conn.setRequestProperty("User-Agent", "OpenAIRE"); + Gson gson = new Gson(); + System.out.println("Request header"); + System.out.println(gson.toJson(conn.getHeaderFields())); + InputStream inputStream = conn.getInputStream(); + + + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java index e2d6ad3e7..ed31c2b7e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -44,7 +44,7 @@ public class RestIteratorTest { final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, - query, entityXpath, authMethod, authToken, resultOffsetParam); + query, entityXpath, authMethod, authToken, resultOffsetParam, null); int i = 20; while (iterator.hasNext() && i > 0) { String result = iterator.next(); From 07e7b9315c0d6014edbbeeeece9b25388ca2f5d3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 29 May 2024 15:50:07 +0200 Subject: [PATCH 2/3] code formatting --- .../plugin/rest/RestCollectorPlugin.java | 5 +++-- .../dhp/collection/plugin/rest/RestIterator.java | 16 ++++++++-------- .../plugin/rest/RestCollectorPluginTest.java | 5 ++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index 8445e49e0..f4ba09f72 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -8,9 +8,10 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import com.google.gson.Gson; import org.apache.commons.lang3.StringUtils; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; @@ -95,7 +96,7 @@ public class RestCollectorPlugin implements CollectorPlugin { authMethod, authToken, resultOutputFormat, - requestHeaders); + requestHeaders); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index e51c9eb1b..2518fd92f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -25,7 +25,6 @@ import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import com.google.common.collect.Maps; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHeaders; @@ -36,6 +35,8 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; +import com.google.common.collect.Maps; + import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; @@ -51,7 +52,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; */ public class RestIterator implements Iterator { - private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; @@ -93,11 +93,10 @@ public class RestIterator implements Iterator { private final String resultOutputFormat; /* - Can be used to set additional request headers, like for content negotiation - */ + * Can be used to set additional request headers, like for content negotiation + */ private Map requestHeaders; - /** * RestIterator class compatible to version 1.3.33 */ @@ -245,11 +244,11 @@ public class RestIterator implements Iterator { if (this.authMethod == "bearer") { log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); requestHeaders.put("Authorization", "Bearer " + authToken); - //requestHeaders.put("Content-Type", "application/json"); + // requestHeaders.put("Content-Type", "application/json"); } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); requestHeaders.put("Authorization", "Basic " + authToken); - //requestHeaders.put("accept", "application/xml"); + // requestHeaders.put("accept", "application/xml"); } HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); conn.setRequestMethod("GET"); @@ -386,7 +385,8 @@ public class RestIterator implements Iterator { try { if (this.resultTotal == -1) { this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { + if ("page".equalsIgnoreCase(this.resumptionType) + && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) { this.resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + this.resultTotal); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java index a9fc325c3..99b95d9e3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -13,11 +13,12 @@ import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; -import com.google.gson.Gson; import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.gson.Gson; + import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; @@ -67,7 +68,6 @@ class RestCollectorPluginTest { api.setBaseUrl(baseUrl); api.setParams(params); - rcp = new RestCollectorPlugin(new HttpClientParams()); } @@ -100,6 +100,5 @@ class RestCollectorPluginTest { System.out.println(gson.toJson(conn.getHeaderFields())); InputStream inputStream = conn.getInputStream(); - } } From 97c970646964efc621e05647a60dc781864b5548 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 2 Aug 2024 15:47:56 +0200 Subject: [PATCH 3/3] minors --- dhp-common/pom.xml | 26 +++++++++++++++---- .../plugin/rest/OsfPreprintCollectorTest.java | 6 ++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index c2f76cff7..bfec019af 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -70,10 +70,7 @@ com.ibm.icu icu4j - - org.apache.hadoop - hadoop-common - + com.github.sisyphsu dateparser @@ -163,7 +160,7 @@ eu.dnetlib.dhp - ${dhp-schemas.artifact} + dhp-schemas @@ -172,4 +169,23 @@ + + + + spark-34 + + + javax.xml.bind + jaxb-api + 2.2.11 + + + com.sun.xml.ws + jaxws-ri + 2.3.3 + pom + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java index 90f4c7f25..0e64f8bab 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest { private final String resumptionType = "page"; private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; - private final String resultSizeParam = ""; - private final String resultSizeValue = ""; + private final String resultSizeParam = "page[size]"; + private final String resultSizeValue = "100"; private final String resultFormatParam = "format"; private final String resultFormatValue = "json"; @@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest { final AtomicInteger i = new AtomicInteger(0); final Stream stream = this.rcp.collect(this.api, new AggregatorReport()); - stream.limit(200).forEach(s -> { + stream.limit(2000).forEach(s -> { Assertions.assertTrue(s.length() > 0); i.incrementAndGet(); log.info(s);