Rest collector plugin on hadoop supports a new param to pass request headers

This commit is contained in:
Alessia 2024-05-29 15:41:36 +02:00
parent 75d5ddb999
commit 1b165a14a0
4 changed files with 83 additions and 34 deletions

View File

@ -1,12 +1,14 @@
package eu.dnetlib.dhp.collection.plugin.rest; package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.Spliterator; import java.util.Spliterator;
import java.util.Spliterators; import java.util.Spliterators;
import java.util.stream.Stream; import java.util.stream.Stream;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import com.google.gson.Gson;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.ApiDescriptor;
@ -47,6 +49,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
final String entityXpath = api.getParams().get("entityXpath"); final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod"); final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken"); final String authToken = api.getParams().get("authToken");
final String requestHeaderMap = api.getParams().get("requestHeaderMap");
Gson gson = new Gson();
Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
final String resultSizeValue = Optional final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue")) .ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)
@ -64,9 +69,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
if (StringUtils.isBlank(resultFormatValue)) { if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty"); throw new CollectorException("Param 'resultFormatValue' is null or empty");
} }
if (StringUtils.isBlank(queryParams)) {
throw new CollectorException("Param 'queryParams' is null or empty");
}
if (StringUtils.isBlank(entityXpath)) { if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty"); throw new CollectorException("Param 'entityXpath' is null or empty");
} }
@ -92,7 +94,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
entityXpath, entityXpath,
authMethod, authMethod,
authToken, authToken,
resultOutputFormat); resultOutputFormat,
requestHeaders);
return StreamSupport return StreamSupport
.stream( .stream(

View File

@ -9,6 +9,7 @@ import java.net.URL;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import java.util.Queue; import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.PriorityBlockingQueue;
@ -24,6 +25,7 @@ import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory; import javax.xml.xpath.XPathFactory;
import com.google.common.collect.Maps;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders; import org.apache.http.HttpHeaders;
@ -49,13 +51,14 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
*/ */
public class RestIterator implements Iterator<String> { public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class); private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8"; public static final String UTF_8 = "UTF-8";
private static final int MAX_ATTEMPTS = 5; private static final int MAX_ATTEMPTS = 5;
private final HttpClientParams clientParams; private final HttpClientParams clientParams;
private final String BASIC = "basic"; private final String AUTHBASIC = "basic";
private final String baseUrl; private final String baseUrl;
private final String resumptionType; private final String resumptionType;
@ -89,6 +92,12 @@ public class RestIterator implements Iterator<String> {
*/ */
private final String resultOutputFormat; private final String resultOutputFormat;
/*
Can be used to set additional request headers, like for content negotiation
*/
private Map<String, String> requestHeaders;
/** /**
* RestIterator class compatible to version 1.3.33 * RestIterator class compatible to version 1.3.33
*/ */
@ -107,7 +116,8 @@ public class RestIterator implements Iterator<String> {
final String entityXpath, final String entityXpath,
final String authMethod, final String authMethod,
final String authToken, final String authToken,
final String resultOutputFormat) { final String resultOutputFormat,
final Map<String, String> requestHeaders) {
this.clientParams = clientParams; this.clientParams = clientParams;
this.baseUrl = baseUrl; this.baseUrl = baseUrl;
@ -119,6 +129,7 @@ public class RestIterator implements Iterator<String> {
this.authMethod = authMethod; this.authMethod = authMethod;
this.authToken = authToken; this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat; this.resultOutputFormat = resultOutputFormat;
this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: ""; : "";
@ -231,25 +242,20 @@ public class RestIterator implements Iterator<String> {
final URL qUrl = new URL(query); final URL qUrl = new URL(query);
log.debug("authMethod: {}", this.authMethod); log.debug("authMethod: {}", this.authMethod);
if ("bearer".equalsIgnoreCase(this.authMethod)) { if (this.authMethod == "bearer") {
log.trace("authMethod before inputStream: {}", resultXml); log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); requestHeaders.put("Authorization", "Bearer " + authToken);
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken); //requestHeaders.put("Content-Type", "application/json");
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
conn.setRequestMethod("GET"); log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
theHttpInputStream = conn.getInputStream(); requestHeaders.put("Authorization", "Basic " + authToken);
} else if (this.BASIC.equalsIgnoreCase(this.authMethod)) { //requestHeaders.put("accept", "application/xml");
log.trace("authMethod before inputStream: {}", resultXml);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
} }
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestMethod("GET");
this.setRequestHeader(conn);
resultStream = conn.getInputStream();
this.resultStream = theHttpInputStream;
if ("json".equals(this.resultOutputFormat)) { if ("json".equals(this.resultOutputFormat)) {
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8); resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson); resultXml = JsonUtils.convertToXML(resultJson);
@ -380,7 +386,7 @@ public class RestIterator implements Iterator<String> {
try { try {
if (this.resultTotal == -1) { if (this.resultTotal == -1) {
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) { if ("page".equalsIgnoreCase(this.resumptionType) && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
this.resultTotal += 1; this.resultTotal += 1;
} // to correct the upper bound } // to correct the upper bound
log.info("resultTotal was -1 is now: " + this.resultTotal); log.info("resultTotal was -1 is now: " + this.resultTotal);
@ -433,6 +439,22 @@ public class RestIterator implements Iterator<String> {
} }
} }
/**
* setRequestHeader
*
* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
* @param conn
*/
private void setRequestHeader(HttpURLConnection conn) {
if (requestHeaders != null) {
for (String key : requestHeaders.keySet()) {
conn.setRequestProperty(key, requestHeaders.get(key));
}
log.debug("Set Request Header with: " + requestHeaders);
}
}
public String getResultFormatValue() { public String getResultFormatValue() {
return this.resultFormatValue; return this.resultFormatValue;
} }

View File

@ -4,10 +4,16 @@
package eu.dnetlib.dhp.collection.plugin.rest; package eu.dnetlib.dhp.collection.plugin.rest;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap; import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream; import java.util.stream.Stream;
import com.google.gson.Gson;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -25,18 +31,18 @@ class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
private final String resumptionType = "count"; private final String resumptionType = "discover";
private final String resumptionParam = "from"; private final String resumptionParam = "skip";
private final String entityXpath = "//hits/hits"; private final String entityXpath = "//*[local-name()='data']";
private final String resumptionXpath = "//hits"; private final String resumptionXpath = "";
private final String resultTotalXpath = "//hits/total"; private final String resultTotalXpath = "//*[local-name()='count']";
private final String resultFormatParam = "format"; private final String resultFormatParam = "";
private final String resultFormatValue = "json"; private final String resultFormatValue = "json";
private final String resultSizeParam = "size"; private final String resultSizeParam = "top";
private final String resultSizeValue = "10"; private final String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; private final String query = "";
// private String query = "=(sources:engrXiv AND type:preprint)"; // private String query = "=(sources:engrXiv AND type:preprint)";
private final String protocolDescriptor = "rest_json2xml"; private final String protocolDescriptor = "rest_json2xml";
@ -56,10 +62,12 @@ class RestCollectorPluginTest {
params.put("resultSizeValue", resultSizeValue); params.put("resultSizeValue", resultSizeValue);
params.put("queryParams", query); params.put("queryParams", query);
params.put("entityXpath", entityXpath); params.put("entityXpath", entityXpath);
params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
api.setBaseUrl(baseUrl); api.setBaseUrl(baseUrl);
api.setParams(params); api.setParams(params);
rcp = new RestCollectorPlugin(new HttpClientParams()); rcp = new RestCollectorPlugin(new HttpClientParams());
} }
@ -78,4 +86,20 @@ class RestCollectorPluginTest {
log.info("{}", i.intValue()); log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0); Assertions.assertTrue(i.intValue() > 0);
} }
@Disabled
@Test
void testUrl() throws IOException {
String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
URL url = new URL(url_s);
final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
conn.setRequestProperty("User-Agent", "OpenAIRE");
Gson gson = new Gson();
System.out.println("Request header");
System.out.println(gson.toJson(conn.getHeaderFields()));
InputStream inputStream = conn.getInputStream();
}
} }

View File

@ -44,7 +44,7 @@ public class RestIteratorTest {
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
query, entityXpath, authMethod, authToken, resultOffsetParam); query, entityXpath, authMethod, authToken, resultOffsetParam, null);
int i = 20; int i = 20;
while (iterator.hasNext() && i > 0) { while (iterator.hasNext() && i > 0) {
String result = iterator.next(); String result = iterator.next();