1
0
Fork 0

RestCollectorPlugin imported from dne45

This commit is contained in:
Claudio Atzori 2021-03-08 09:44:09 +01:00
parent fa7930d2e2
commit acbe3119a4
9 changed files with 768 additions and 1 deletions

View File

@ -86,6 +86,11 @@
<artifactId>jaxen</artifactId> <artifactId>jaxen</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv --> <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
public class CollectorWorker extends ReportingJob { public class CollectorWorker extends ReportingJob {
@ -109,6 +110,8 @@ public class CollectorWorker extends ReportingJob {
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) { switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
case oai: case oai:
return new OaiCollectorPlugin(clientParams); return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
case other: case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type")) .ofNullable(api.getParams().get("other_plugin_type"))

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.collection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class JsonUtils {
private static final Log log = LogFactory.getLog(JsonUtils.class);
public static final String wrapName = "recordWrap";
/**
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
* and work-around for the JSON to XML converting of org.json.XML-package.
*
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
*
* @param jsonInput
* @return convertedJsonKeynameOutput
*/
public String syntaxConvertJsonKeyNames(String jsonInput) {
log.trace("before convertJsonKeyNames: " + jsonInput);
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
// replace ' 's in JSON Namens with '_'
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
}
// replace forward-slash (sign '/' ) in JSON Names with '_'
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
}
// replace '(' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
}
// replace ')' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
}
// add prefix of startNumbers in JSON Keynames with 'n_'
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
}
// add prefix of only numbers in JSON Keynames with 'm_'
while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
}
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
}
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
// }
// replace '=' in JSON Keynames with '-'
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
}
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
return jsonInput;
}
public String convertToXML(final String jsonRecord) {
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
log.trace("before inputStream: " + resultXml);
resultXml = XmlCleaner.cleanAllEntities(resultXml);
log.trace("after cleaning: " + resultXml);
return resultXml;
}
}

View File

@ -10,7 +10,7 @@ import eu.dnetlib.dhp.collection.CollectorException;
public interface CollectorPlugin { public interface CollectorPlugin {
enum NAME { enum NAME {
oai, other; oai, other, rest_json2xml;
public enum OTHER_NAME { public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb mdstore_mongodb_dump, mdstore_mongodb

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
/**
* TODO: delegate HTTP requests to the common HttpConnector2 implementation.
*
* @author js, Andreas Czerniak
* @date 2020-04-09
*
*/
public class RestCollectorPlugin implements CollectorPlugin {
private HttpClientParams clientParams;
public RestCollectorPlugin(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
final String baseUrl = api.getBaseUrl();
final String resumptionType = api.getParams().get("resumptionType");
final String resumptionParam = api.getParams().get("resumptionParam");
final String resumptionXpath = api.getParams().get("resumptionXpath");
final String resultTotalXpath = api.getParams().get("resultTotalXpath");
final String resultFormatParam = api.getParams().get("resultFormatParam");
final String resultFormatValue = api.getParams().get("resultFormatValue");
final String resultSizeParam = api.getParams().get("resultSizeParam");
final String resultSizeValue = (StringUtils.isBlank(api.getParams().get("resultSizeValue"))) ? "100"
: api.getParams().get("resultSizeValue");
final String queryParams = api.getParams().get("queryParams");
final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken");
if (StringUtils.isBlank(baseUrl)) {
throw new CollectorException("Param 'baseUrl' is null or empty");
}
if (StringUtils.isBlank(resumptionType)) {
throw new CollectorException("Param 'resumptionType' is null or empty");
}
if (StringUtils.isBlank(resumptionParam)) {
throw new CollectorException("Param 'resumptionParam' is null or empty");
}
if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty");
}
if (StringUtils.isBlank(queryParams)) {
throw new CollectorException("Param 'queryParams' is null or empty");
}
if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty");
}
RestIterator it = new RestIterator(
getClientParams(),
baseUrl,
resumptionType,
resumptionParam,
resumptionXpath,
resultTotalXpath,
resultFormatParam,
resultFormatValue,
resultSizeParam,
resultSizeValue,
queryParams,
entityXpath,
authMethod,
authToken);
return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
}
public HttpClientParams getClientParams() {
return clientParams;
}
}

View File

@ -0,0 +1,442 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHeaders;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.JsonUtils;
/**
* log.debug(...) equal to log.trace(...) in the application-logs
* <p>
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
*
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
* @date 2020-04-09
*
*/
public class RestIterator implements Iterator<String> {
private static final Log log = LogFactory.getLog(RestIterator.class);
private HttpClientParams clientParams;
private final String BASIC = "basic";
private JsonUtils jsonUtils;
private String baseUrl;
private String resumptionType;
private String resumptionParam;
private String resultFormatValue;
private String queryParams;
private int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
// or token scanned from results)
private InputStream resultStream;
private Transformer transformer;
private XPath xpath;
private String query;
private XPathExpression xprResultTotalPath;
private XPathExpression xprResumptionPath;
private XPathExpression xprEntity;
private String queryFormat;
private String querySize;
private String authMethod;
private String authToken;
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
private int discoverResultSize = 0;
private int pagination = 1;
/**
* RestIterator class
*
* compatible to version before 1.3.33
*
* @param baseUrl
* @param resumptionType
* @param resumptionParam
* @param resumptionXpath
* @param resultTotalXpath
* @param resultFormatParam
* @param resultFormatValue
* @param resultSizeParam
* @param resultSizeValueStr
* @param queryParams
* @param entityXpath
*/
public RestIterator(
final HttpClientParams clientParams,
final String baseUrl,
final String resumptionType,
final String resumptionParam,
final String resumptionXpath,
final String resultTotalXpath,
final String resultFormatParam,
final String resultFormatValue,
final String resultSizeParam,
final String resultSizeValueStr,
final String queryParams,
final String entityXpath) {
this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath,
resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "",
"");
}
public RestIterator(
final HttpClientParams clientParams,
final String baseUrl,
final String resumptionType,
final String resumptionParam,
final String resumptionXpath,
final String resultTotalXpath,
final String resultFormatParam,
final String resultFormatValue,
final String resultSizeParam,
final String resultSizeValueStr,
final String queryParams,
final String entityXpath,
final String authMethod,
final String authToken,
final String resultOffsetParam) {
this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath,
resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "",
"");
}
/** RestIterator class
* compatible to version 1.3.33
*/
public RestIterator(
final HttpClientParams clientParams,
final String baseUrl,
final String resumptionType,
final String resumptionParam,
final String resumptionXpath,
final String resultTotalXpath,
final String resultFormatParam,
final String resultFormatValue,
final String resultSizeParam,
final String resultSizeValueStr,
final String queryParams,
final String entityXpath,
final String authMethod,
final String authToken) {
this.clientParams = clientParams;
this.jsonUtils = new JsonUtils();
this.baseUrl = baseUrl;
this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam;
this.resultFormatValue = resultFormatValue;
this.queryParams = queryParams;
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
this.authMethod = authMethod;
this.authToken = authToken;
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
try {
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
} catch (Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
}
initQueue();
}
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
throws TransformerConfigurationException, XPathExpressionException {
transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
xpath = XPathFactory.newInstance().newXPath();
xprResultTotalPath = xpath.compile(resultTotalXpath);
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
xprEntity = xpath.compile(entityXpath);
}
private void initQueue() {
query = baseUrl + "?" + queryParams + querySize + queryFormat;
}
private void disconnect() {
// TODO close inputstream
}
/*
* (non-Javadoc)
* @see java.util.Iterator#hasNext()
*/
@Override
public boolean hasNext() {
if (recordQueue.isEmpty() && query.isEmpty()) {
disconnect();
return false;
} else {
return true;
}
}
/*
* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public String next() {
synchronized (recordQueue) {
while (recordQueue.isEmpty() && !query.isEmpty()) {
try {
log.debug("get Query: " + query);
query = downloadPage(query);
log.debug("next queryURL from downloadPage(): " + query);
} catch (CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: " + e);
throw new RuntimeException(e);
}
}
return recordQueue.poll();
}
}
/*
* download page and return nextQuery
*/
private String downloadPage(String query) throws CollectorException {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
Node resultNode = null;
NodeList nodeList = null;
InputStream theHttpInputStream;
// check if cursor=* is initial set otherwise add it to the queryParam URL
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
if (!query.contains("&cursor=")) {
query += "&cursor=*";
}
}
try {
URL qUrl = new URL(query);
log.debug("authMethod :" + authMethod);
if (this.authMethod == "bearer") {
log.trace("authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/json");
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: " + resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, "application/xml");
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
}
resultStream = theHttpInputStream;
if ("json".equalsIgnoreCase(resultFormatValue)) {
resultJson = IOUtils.toString(resultStream, "UTF-8");
resultXml = jsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
}
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: " + nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) {
StringWriter sw = new StringWriter();
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
recordQueue.add(sw.toString());
}
} else {
log.info("resultXml is equal with emptyXml");
}
resumptionInt += resultSizeValue;
String qUrlArgument = "";
switch (resumptionType.toLowerCase()) {
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
resumptionStr = xprResumptionPath.evaluate(resultNode);
break;
case "count": // begin at one step for all records, iterate over items
resumptionStr = Integer.toString(resumptionInt);
break;
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
if (resultSizeValue < 2) {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
}
qUrlArgument = qUrl.getQuery();
String[] arrayQUrlArgument = qUrlArgument.split("&");
int urlOldResumptionSize = 0;
for (String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(resumptionParam)) {
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
if (isInteger(resumptionKeyValue[1])) {
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
} else {
log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
}
}
}
if (((emptyXml).equalsIgnoreCase(resultXml))
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
// resumptionStr = "";
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
}
resultTotal = discoverResultSize;
} else {
resumptionStr = Integer.toString(resumptionInt);
resultTotal = resumptionInt + 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
}
}
log.debug("discoverResultSize: " + discoverResultSize);
break;
case "pagination":
case "page": // pagination, iterate over page numbers
pagination += 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
} else {
resultTotal = discoverResultSize;
pagination = discoverResultSize;
}
resumptionInt = pagination;
resumptionStr = Integer.toString(resumptionInt);
break;
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
// solr)
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
// deep-cursor, Param 'resultSizeValue' is less than 2");}
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
queryParams = queryParams.replace("&cursor=*", "");
// terminating if length of nodeList is 0
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
} else {
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
// because the iteration is over
// real length and the
// resultSizeValue is added before
// the switch()
}
discoverResultSize = nodeList.getLength();
log
.debug(
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
break;
default: // otherwise: abort
// resultTotal = resumptionInt;
break;
}
} catch (Exception e) {
log.error(e);
throw new IllegalStateException("collection failed: " + e.getMessage());
}
try {
if (resultTotal == -1) {
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
if (resumptionType.toLowerCase().equals("page") && !BASIC.equalsIgnoreCase(authMethod)) {
resultTotal += 1;
} // to correct the upper bound
log.info("resultTotal was -1 is now: " + resultTotal);
}
} catch (Exception e) {
log.error(e);
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
}
log.debug("resultTotal: " + resultTotal);
log.debug("resInt: " + resumptionInt);
String nextQuery;
if (resumptionInt <= resultTotal) {
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+ queryFormat;
} else {
nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
// resumptionInt and prevent a NullPointer Exception at mdStore
}
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
}
private boolean isInteger(String s) {
boolean isValidInteger = false;
try {
Integer.parseInt(s);
// s is a valid integer
isValidInteger = true;
} catch (NumberFormatException ex) {
// s is not an integer
}
return isValidInteger;
}
// Method to encode a string value using `UTF-8` encoding scheme
private String encodeValue(String value) {
try {
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException(ex.getCause());
}
}
}

View File

@ -0,0 +1,81 @@
/**
*
*/
package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.HttpClientParams;
/**
* @author js, Andreas Czerniak
*
*/
public class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private String resumptionType = "count";
private String resumptionParam = "from";
private String entityXpath = "//hits/hits";
private String resumptionXpath = "//hits";
private String resultTotalXpath = "//hits/total";
private String resultFormatParam = "format";
private String resultFormatValue = "json";
private String resultSizeParam = "size";
private String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
// private String query = "=(sources:engrXiv AND type:preprint)";
private String protocolDescriptor = "rest_json2xml";
private ApiDescriptor api = new ApiDescriptor();
private RestCollectorPlugin rcp;
@BeforeEach
public void setUp() {
HashMap<String, String> params = new HashMap<>();
params.put("resumptionType", resumptionType);
params.put("resumptionParam", resumptionParam);
params.put("resumptionXpath", resumptionXpath);
params.put("resultTotalXpath", resultTotalXpath);
params.put("resultFormatParam", resultFormatParam);
params.put("resultFormatValue", resultFormatValue);
params.put("resultSizeParam", resultSizeParam);
params.put("resultSizeValue", resultSizeValue);
params.put("queryParams", query);
params.put("entityXpath", entityXpath);
api.setBaseUrl(baseUrl);
api.setParams(params);
rcp = new RestCollectorPlugin(new HttpClientParams());
}
@Disabled
@Test
public void test() throws CollectorException {
AtomicInteger i = new AtomicInteger(0);
final Stream<String> stream = rcp.collect(api, new AggregatorReport());
stream.limit(200).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
i.incrementAndGet();
log.info(s);
});
log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0);
}
}

View File

@ -0,0 +1,54 @@
/**
*
*/
package eu.dnetlib.dhp.collection.plugin.rest;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.HttpClientParams;
/**
*
* @author js, Andreas Czerniak
* @date 2020-04-08
*/
public class RestIteratorTest {
private static final Logger log = LoggerFactory.getLogger(RestIteratorTest.class);
private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
private String resumptionType = "count";
private String resumptionParam = "from";
private String resumptionXpath = "";
private String resultTotalXpath = "//hits/total";
private String entityXpath = "//hits/hits";
private String resultFormatParam = "format";
private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase
private String resultSizeParam = "size";
private String resultSizeValue = "10";
private String authMethod = "";
private String authToken = "";
private String resultOffsetParam = "cursor";
private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
@Disabled
@Test
public void test() {
HttpClientParams clientParams = new HttpClientParams();
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
query, entityXpath, authMethod, authToken, resultOffsetParam);
int i = 20;
while (iterator.hasNext() && i > 0) {
String result = iterator.next();
i--;
}
}
}

View File

@ -461,6 +461,12 @@
<version>${apache.poi.version}</version> <version>${apache.poi.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20180813</version>
</dependency>
<dependency> <dependency>
<groupId>org.json4s</groupId> <groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId> <artifactId>json4s-jackson_2.11</artifactId>