From 23477f3e80419add8adadac70d9a17c41a8c3d8f Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 10 Jun 2024 19:03:30 +0200 Subject: [PATCH] Fixes for pagination strategy looping at end of download --- .../collection/plugin/rest/RestIterator.java | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 2518fd92f..9037a454e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -12,6 +12,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -19,16 +21,10 @@ import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathExpressionException; -import javax.xml.xpath.XPathFactory; +import javax.xml.xpath.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.http.HttpHeaders; -import org.apache.http.entity.ContentType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Node; @@ -51,7 +47,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; * */ public class RestIterator implements Iterator { - private static final Logger log = LoggerFactory.getLogger(RestIterator.class); public static final String UTF_8 = "UTF-8"; private static final int MAX_ATTEMPTS = 5; @@ -60,11 +55,15 @@ public class RestIterator implements Iterator { private final String AUTHBASIC = "basic"; + private static final String XML_HEADER = ""; + private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + ">"; + private final String baseUrl; private final String resumptionType; private final String resumptionParam; private final String resultFormatValue; - private String queryParams; + private String queryParams = ""; private final int resultSizeValue; private int resumptionInt = 0; // integer resumption token (first record to harvest) private int resultTotal = -1; @@ -158,7 +157,12 @@ public class RestIterator implements Iterator { } private void initQueue() { - this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat; + if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) { + query = baseUrl; + } else { + query = baseUrl + "?" + queryParams + querySize + queryFormat; + } + log.info("REST calls starting with {}", this.query); } @@ -219,9 +223,8 @@ public class RestIterator implements Iterator { try { String resultJson; - String resultXml = ""; + String resultXml = XML_HEADER; String nextQuery = ""; - final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + ">"; Node resultNode = null; NodeList nodeList = null; String qUrlArgument = ""; @@ -236,6 +239,21 @@ public class RestIterator implements Iterator { } } + // find pagination page start number in queryParam and remove before start the first query + if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page")) + && (query.contains("paginationStart="))) { + + final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query); + m.find(); // guaranteed to be true for this regex + + String[] pageVal = m.group(0).split("="); + pagination = Integer.parseInt(pageVal[1]); + + // remove page start number from queryParams + query = query.replaceFirst("&?paginationStart=[0-9]+", ""); + + } + try { log.info("requesting URL [{}]", query); @@ -261,7 +279,7 @@ public class RestIterator implements Iterator { this.resultStream = IOUtils.toInputStream(resultXml, UTF_8); } - if (!(emptyXml).equalsIgnoreCase(resultXml)) { + if (!isEmptyXml(resultXml)) { resultNode = (Node) this.xpath .evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE); nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET); @@ -270,8 +288,7 @@ public class RestIterator implements Iterator { final StringWriter sw = new StringWriter(); this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); final String toEnqueue = sw.toString(); - if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) - || emptyXml.equalsIgnoreCase(toEnqueue)) { + if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) { log .warn( "The following record resulted in empty item for the feeding queue: {}", resultXml); @@ -299,6 +316,7 @@ public class RestIterator implements Iterator { throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); } qUrlArgument = qUrl.getQuery(); + final String[] arrayQUrlArgument = qUrlArgument.split("&"); for (final String arrayUrlArgStr : arrayQUrlArgument) { if (arrayUrlArgStr.startsWith(this.resumptionParam)) { @@ -312,7 +330,7 @@ public class RestIterator implements Iterator { } } - if (((emptyXml).equalsIgnoreCase(resultXml)) + if (isEmptyXml(resultXml) || ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) { // resumptionStr = ""; if (nodeList != null) { @@ -331,13 +349,13 @@ public class RestIterator implements Iterator { case "pagination": case "page": // pagination, iterate over page numbers - this.pagination += 1; - if (nodeList != null) { + if (nodeList != null && nodeList.getLength() > 0) { this.discoverResultSize += nodeList.getLength(); } else { this.resultTotal = this.discoverResultSize; this.pagination = this.discoverResultSize; } + this.pagination += 1; this.resumptionInt = this.pagination; this.resumptionStr = Integer.toString(this.resumptionInt); break; @@ -415,6 +433,10 @@ public class RestIterator implements Iterator { } + private boolean isEmptyXml(String s) { + return EMPTY_XML.equalsIgnoreCase(s); + } + private boolean isInteger(final String s) { boolean isValidInteger = false; try {