From acbe3119a4a5510bfd6ab887c6165e96947e6409 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 8 Mar 2021 09:44:09 +0100 Subject: [PATCH] RestCollectorPlugin imported from dne45 --- dhp-workflows/dhp-aggregation/pom.xml | 5 + .../dhp/collection/CollectorWorker.java | 3 + .../eu/dnetlib/dhp/collection/JsonUtils.java | 84 ++++ .../collection/plugin/CollectorPlugin.java | 2 +- .../plugin/rest/RestCollectorPlugin.java | 92 ++++ .../collection/plugin/rest/RestIterator.java | 442 ++++++++++++++++++ .../plugin/rest/RestCollectorPluginTest.java | 81 ++++ .../plugin/rest/RestIteratorTest.java | 54 +++ pom.xml | 6 + 9 files changed, 768 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 6887be55e..cfe9e74fd 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -86,6 +86,11 @@ jaxen + + org.json + json + + org.apache.commons diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index ef29cb5b1..f9d7d7dae 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -24,6 +24,7 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; public class CollectorWorker extends ReportingJob { @@ -109,6 +110,8 @@ public class CollectorWorker extends ReportingJob { switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) { case oai: return new OaiCollectorPlugin(clientParams); + case rest_json2xml: + return new RestCollectorPlugin(clientParams); case other: final CollectorPlugin.NAME.OTHER_NAME plugin = Optional .ofNullable(api.getParams().get("other_plugin_type")) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java new file mode 100644 index 000000000..da3768a4a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class JsonUtils { + + private static final Log log = LogFactory.getLog(JsonUtils.class); + + public static final String wrapName = "recordWrap"; + + /** + * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to '' + * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names + * and work-around for the JSON to XML converting of org.json.XML-package. + * + * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"], + * + * @param jsonInput + * @return convertedJsonKeynameOutput + */ + public String syntaxConvertJsonKeyNames(String jsonInput) { + + log.trace("before convertJsonKeyNames: " + jsonInput); + // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) + // replace ' 's in JSON Namens with '_' + while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); + } + + // replace forward-slash (sign '/' ) in JSON Names with '_' + while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); + } + + // replace '(' in JSON Names with '' + while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); + } + + // replace ')' in JSON Names with '' + while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); + } + + // add prefix of startNumbers in JSON Keynames with 'n_' + while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":"); + } + // add prefix of only numbers in JSON Keynames with 'm_' + while (jsonInput.matches(".*\"([0-9]+)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":"); + } + + // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with '' + while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); + } + + // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. + // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) { + // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":"); + // } + + // replace '=' in JSON Keynames with '-' + while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":"); + } + + log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); + return jsonInput; + } + + public String convertToXML(final String jsonRecord) { + String resultXml = ""; + org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord)); + resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element + log.trace("before inputStream: " + resultXml); + resultXml = XmlCleaner.cleanAllEntities(resultXml); + log.trace("after cleaning: " + resultXml); + return resultXml; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 0ed6be5fa..457f63468 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.collection.CollectorException; public interface CollectorPlugin { enum NAME { - oai, other; + oai, other, rest_json2xml; public enum OTHER_NAME { mdstore_mongodb_dump, mdstore_mongodb diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java new file mode 100644 index 000000000..ad8bfa4ea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -0,0 +1,92 @@ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; + +/** + * TODO: delegate HTTP requests to the common HttpConnector2 implementation. + * + * @author js, Andreas Czerniak + * @date 2020-04-09 + * + */ +public class RestCollectorPlugin implements CollectorPlugin { + + private HttpClientParams clientParams; + + public RestCollectorPlugin(HttpClientParams clientParams) { + this.clientParams = clientParams; + } + + @Override + public Stream collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException { + final String baseUrl = api.getBaseUrl(); + final String resumptionType = api.getParams().get("resumptionType"); + final String resumptionParam = api.getParams().get("resumptionParam"); + final String resumptionXpath = api.getParams().get("resumptionXpath"); + final String resultTotalXpath = api.getParams().get("resultTotalXpath"); + final String resultFormatParam = api.getParams().get("resultFormatParam"); + final String resultFormatValue = api.getParams().get("resultFormatValue"); + final String resultSizeParam = api.getParams().get("resultSizeParam"); + final String resultSizeValue = (StringUtils.isBlank(api.getParams().get("resultSizeValue"))) ? "100" + : api.getParams().get("resultSizeValue"); + final String queryParams = api.getParams().get("queryParams"); + final String entityXpath = api.getParams().get("entityXpath"); + final String authMethod = api.getParams().get("authMethod"); + final String authToken = api.getParams().get("authToken"); + + if (StringUtils.isBlank(baseUrl)) { + throw new CollectorException("Param 'baseUrl' is null or empty"); + } + if (StringUtils.isBlank(resumptionType)) { + throw new CollectorException("Param 'resumptionType' is null or empty"); + } + if (StringUtils.isBlank(resumptionParam)) { + throw new CollectorException("Param 'resumptionParam' is null or empty"); + } + if (StringUtils.isBlank(resultFormatValue)) { + throw new CollectorException("Param 'resultFormatValue' is null or empty"); + } + if (StringUtils.isBlank(queryParams)) { + throw new CollectorException("Param 'queryParams' is null or empty"); + } + if (StringUtils.isBlank(entityXpath)) { + throw new CollectorException("Param 'entityXpath' is null or empty"); + } + + RestIterator it = new RestIterator( + getClientParams(), + baseUrl, + resumptionType, + resumptionParam, + resumptionXpath, + resultTotalXpath, + resultFormatParam, + resultFormatValue, + resultSizeParam, + resultSizeValue, + queryParams, + entityXpath, + authMethod, + authToken); + + return StreamSupport + .stream( + Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false); + } + + public HttpClientParams getClientParams() { + return clientParams; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java new file mode 100644 index 000000000..b728293d5 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -0,0 +1,442 @@ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.io.InputStream; +import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; +import java.util.Queue; +import java.util.concurrent.PriorityBlockingQueue; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.*; + +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.http.HttpHeaders; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.JsonUtils; + +/** + * log.debug(...) equal to log.trace(...) in the application-logs + *

+ * known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue + * + * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak + * @date 2020-04-09 + * + */ +public class RestIterator implements Iterator { + + private static final Log log = LogFactory.getLog(RestIterator.class); + + private HttpClientParams clientParams; + + private final String BASIC = "basic"; + + private JsonUtils jsonUtils; + + private String baseUrl; + private String resumptionType; + private String resumptionParam; + private String resultFormatValue; + private String queryParams; + private int resultSizeValue; + private int resumptionInt = 0; // integer resumption token (first record to harvest) + private int resultTotal = -1; + private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest + // or token scanned from results) + private InputStream resultStream; + private Transformer transformer; + private XPath xpath; + private String query; + private XPathExpression xprResultTotalPath; + private XPathExpression xprResumptionPath; + private XPathExpression xprEntity; + private String queryFormat; + private String querySize; + private String authMethod; + private String authToken; + private final Queue recordQueue = new PriorityBlockingQueue(); + private int discoverResultSize = 0; + private int pagination = 1; + + /** + * RestIterator class + * + * compatible to version before 1.3.33 + * + * @param baseUrl + * @param resumptionType + * @param resumptionParam + * @param resumptionXpath + * @param resultTotalXpath + * @param resultFormatParam + * @param resultFormatValue + * @param resultSizeParam + * @param resultSizeValueStr + * @param queryParams + * @param entityXpath + */ + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String resultSizeValueStr, + final String queryParams, + final String entityXpath) { + this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, + resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "", + ""); + } + + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String resultSizeValueStr, + final String queryParams, + final String entityXpath, + final String authMethod, + final String authToken, + final String resultOffsetParam) { + this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, + resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "", + ""); + } + + /** RestIterator class + * compatible to version 1.3.33 + */ + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String resultSizeValueStr, + final String queryParams, + final String entityXpath, + final String authMethod, + final String authToken) { + this.clientParams = clientParams; + this.jsonUtils = new JsonUtils(); + this.baseUrl = baseUrl; + this.resumptionType = resumptionType; + this.resumptionParam = resumptionParam; + this.resultFormatValue = resultFormatValue; + this.queryParams = queryParams; + this.resultSizeValue = Integer.valueOf(resultSizeValueStr); + this.authMethod = authMethod; + this.authToken = authToken; + + queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue + : ""; + querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : ""; + + try { + initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); + } catch (Exception e) { + throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); + } + initQueue(); + } + + private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) + throws TransformerConfigurationException, XPathExpressionException { + transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + xpath = XPathFactory.newInstance().newXPath(); + xprResultTotalPath = xpath.compile(resultTotalXpath); + xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); + xprEntity = xpath.compile(entityXpath); + } + + private void initQueue() { + query = baseUrl + "?" + queryParams + querySize + queryFormat; + } + + private void disconnect() { + // TODO close inputstream + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + @Override + public boolean hasNext() { + if (recordQueue.isEmpty() && query.isEmpty()) { + disconnect(); + return false; + } else { + return true; + } + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#next() + */ + @Override + public String next() { + synchronized (recordQueue) { + while (recordQueue.isEmpty() && !query.isEmpty()) { + try { + log.debug("get Query: " + query); + query = downloadPage(query); + log.debug("next queryURL from downloadPage(): " + query); + } catch (CollectorException e) { + log.debug("CollectorPlugin.next()-Exception: " + e); + throw new RuntimeException(e); + } + } + return recordQueue.poll(); + } + } + + /* + * download page and return nextQuery + */ + private String downloadPage(String query) throws CollectorException { + String resultJson; + String resultXml = ""; + String emptyXml = resultXml + "<" + JsonUtils.wrapName + ">"; + Node resultNode = null; + NodeList nodeList = null; + InputStream theHttpInputStream; + + // check if cursor=* is initial set otherwise add it to the queryParam URL + if (resumptionType.equalsIgnoreCase("deep-cursor")) { + log.debug("check resumptionType deep-cursor and check cursor=*?" + query); + if (!query.contains("&cursor=")) { + query += "&cursor=*"; + } + } + + try { + URL qUrl = new URL(query); + log.debug("authMethod :" + authMethod); + if (this.authMethod == "bearer") { + log.trace("authMethod before inputStream: " + resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken); + conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/json"); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else if (BASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("authMethod before inputStream: " + resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken); + conn.setRequestProperty(HttpHeaders.ACCEPT, "application/xml"); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else { + theHttpInputStream = qUrl.openStream(); + } + + resultStream = theHttpInputStream; + if ("json".equalsIgnoreCase(resultFormatValue)) { + resultJson = IOUtils.toString(resultStream, "UTF-8"); + resultXml = jsonUtils.convertToXML(resultJson); + resultStream = IOUtils.toInputStream(resultXml, "UTF-8"); + } + + if (!(emptyXml).equalsIgnoreCase(resultXml)) { + resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE); + nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET); + log.debug("nodeList.length: " + nodeList.getLength()); + for (int i = 0; i < nodeList.getLength(); i++) { + StringWriter sw = new StringWriter(); + transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); + recordQueue.add(sw.toString()); + } + } else { + log.info("resultXml is equal with emptyXml"); + } + + resumptionInt += resultSizeValue; + + String qUrlArgument = ""; + switch (resumptionType.toLowerCase()) { + case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items + resumptionStr = xprResumptionPath.evaluate(resultNode); + break; + + case "count": // begin at one step for all records, iterate over items + resumptionStr = Integer.toString(resumptionInt); + break; + + case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) + if (resultSizeValue < 2) { + throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); + } + qUrlArgument = qUrl.getQuery(); + String[] arrayQUrlArgument = qUrlArgument.split("&"); + int urlOldResumptionSize = 0; + for (String arrayUrlArgStr : arrayQUrlArgument) { + if (arrayUrlArgStr.startsWith(resumptionParam)) { + String[] resumptionKeyValue = arrayUrlArgStr.split("="); + if (isInteger(resumptionKeyValue[1])) { + urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); + log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize); + } else { + log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]); + } + } + } + + if (((emptyXml).equalsIgnoreCase(resultXml)) + || ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) { + // resumptionStr = ""; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } + resultTotal = discoverResultSize; + } else { + resumptionStr = Integer.toString(resumptionInt); + resultTotal = resumptionInt + 1; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } + } + log.debug("discoverResultSize: " + discoverResultSize); + break; + + case "pagination": + case "page": // pagination, iterate over page numbers + pagination += 1; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } else { + resultTotal = discoverResultSize; + pagination = discoverResultSize; + } + resumptionInt = pagination; + resumptionStr = Integer.toString(resumptionInt); + break; + + case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in + // solr) + // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: + // deep-cursor, Param 'resultSizeValue' is less than 2");} + + resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode)); + queryParams = queryParams.replace("&cursor=*", ""); + + // terminating if length of nodeList is 0 + if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) { + resumptionInt += (nodeList.getLength() + 1 - resultSizeValue); + } else { + resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue + // because the iteration is over + // real length and the + // resultSizeValue is added before + // the switch() + } + + discoverResultSize = nodeList.getLength(); + + log + .debug( + "downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + + queryParams + " resumptionLengthIncreased: " + resumptionInt); + + break; + + default: // otherwise: abort + // resultTotal = resumptionInt; + break; + } + + } catch (Exception e) { + log.error(e); + throw new IllegalStateException("collection failed: " + e.getMessage()); + } + + try { + if (resultTotal == -1) { + resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); + if (resumptionType.toLowerCase().equals("page") && !BASIC.equalsIgnoreCase(authMethod)) { + resultTotal += 1; + } // to correct the upper bound + log.info("resultTotal was -1 is now: " + resultTotal); + } + } catch (Exception e) { + log.error(e); + throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage()); + } + log.debug("resultTotal: " + resultTotal); + log.debug("resInt: " + resumptionInt); + String nextQuery; + if (resumptionInt <= resultTotal) { + nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + + queryFormat; + } else { + nextQuery = ""; + // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the + // resumptionInt and prevent a NullPointer Exception at mdStore + } + log.debug("nextQueryUrl: " + nextQuery); + return nextQuery; + } + + private boolean isInteger(String s) { + boolean isValidInteger = false; + try { + Integer.parseInt(s); + + // s is a valid integer + + isValidInteger = true; + } catch (NumberFormatException ex) { + // s is not an integer + } + + return isValidInteger; + } + + // Method to encode a string value using `UTF-8` encoding scheme + private String encodeValue(String value) { + try { + return URLEncoder.encode(value, StandardCharsets.UTF_8.toString()); + } catch (UnsupportedEncodingException ex) { + throw new RuntimeException(ex.getCause()); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java new file mode 100644 index 000000000..648ac85fb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -0,0 +1,81 @@ +/** + * + */ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; + +/** + * @author js, Andreas Czerniak + * + */ +public class RestCollectorPluginTest { + + private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); + + private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; + private String resumptionType = "count"; + private String resumptionParam = "from"; + private String entityXpath = "//hits/hits"; + private String resumptionXpath = "//hits"; + private String resultTotalXpath = "//hits/total"; + private String resultFormatParam = "format"; + private String resultFormatValue = "json"; + private String resultSizeParam = "size"; + private String resultSizeValue = "10"; + // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; + private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + // private String query = "=(sources:engrXiv AND type:preprint)"; + + private String protocolDescriptor = "rest_json2xml"; + private ApiDescriptor api = new ApiDescriptor(); + private RestCollectorPlugin rcp; + + @BeforeEach + public void setUp() { + HashMap params = new HashMap<>(); + params.put("resumptionType", resumptionType); + params.put("resumptionParam", resumptionParam); + params.put("resumptionXpath", resumptionXpath); + params.put("resultTotalXpath", resultTotalXpath); + params.put("resultFormatParam", resultFormatParam); + params.put("resultFormatValue", resultFormatValue); + params.put("resultSizeParam", resultSizeParam); + params.put("resultSizeValue", resultSizeValue); + params.put("queryParams", query); + params.put("entityXpath", entityXpath); + + api.setBaseUrl(baseUrl); + api.setParams(params); + + rcp = new RestCollectorPlugin(new HttpClientParams()); + } + + @Disabled + @Test + public void test() throws CollectorException { + AtomicInteger i = new AtomicInteger(0); + final Stream stream = rcp.collect(api, new AggregatorReport()); + + stream.limit(200).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + i.incrementAndGet(); + log.info(s); + }); + + log.info("{}", i.intValue()); + Assertions.assertTrue(i.intValue() > 0); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java new file mode 100644 index 000000000..16604e0eb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -0,0 +1,54 @@ +/** + * + */ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.HttpClientParams; + +/** + * + * @author js, Andreas Czerniak + * @date 2020-04-08 + */ +public class RestIteratorTest { + + private static final Logger log = LoggerFactory.getLogger(RestIteratorTest.class); + + private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; + private String resumptionType = "count"; + private String resumptionParam = "from"; + private String resumptionXpath = ""; + private String resultTotalXpath = "//hits/total"; + private String entityXpath = "//hits/hits"; + private String resultFormatParam = "format"; + private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase + private String resultSizeParam = "size"; + private String resultSizeValue = "10"; + private String authMethod = ""; + private String authToken = ""; + private String resultOffsetParam = "cursor"; + private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; + + @Disabled + @Test + public void test() { + + HttpClientParams clientParams = new HttpClientParams(); + + final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, + resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, + query, entityXpath, authMethod, authToken, resultOffsetParam); + int i = 20; + while (iterator.hasNext() && i > 0) { + String result = iterator.next(); + + i--; + } + } +} diff --git a/pom.xml b/pom.xml index 45bb6bf78..5c45fad5f 100644 --- a/pom.xml +++ b/pom.xml @@ -461,6 +461,12 @@ ${apache.poi.version} + + org.json + json + 20180813 + + org.json4s json4s-jackson_2.11