Fixes for pagination strategy looping at end of download

This commit is contained in:
Giambattista Bloisi 2024-06-10 19:03:30 +02:00
parent b0eba210c0
commit 85c1eae7e0
1 changed files with 40 additions and 18 deletions

View File

@ -12,6 +12,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
@ -19,16 +21,10 @@ import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import javax.xml.xpath.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
@ -51,7 +47,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
*
*/
public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8";
private static final int MAX_ATTEMPTS = 5;
@ -60,11 +55,15 @@ public class RestIterator implements Iterator<String> {
private final String AUTHBASIC = "basic";
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
+ ">";
private final String baseUrl;
private final String resumptionType;
private final String resumptionParam;
private final String resultFormatValue;
private String queryParams;
private String queryParams = "";
private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
@ -158,7 +157,12 @@ public class RestIterator implements Iterator<String> {
}
private void initQueue() {
this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat;
if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
query = baseUrl;
} else {
query = baseUrl + "?" + queryParams + querySize + queryFormat;
}
log.info("REST calls starting with {}", this.query);
}
@ -219,9 +223,8 @@ public class RestIterator implements Iterator<String> {
try {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String resultXml = XML_HEADER;
String nextQuery = "";
final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
@ -236,6 +239,21 @@ public class RestIterator implements Iterator<String> {
}
}
// find pagination page start number in queryParam and remove before start the first query
if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
&& (query.contains("paginationStart="))) {
final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
m.find(); // guaranteed to be true for this regex
String[] pageVal = m.group(0).split("=");
pagination = Integer.parseInt(pageVal[1]);
// remove page start number from queryParams
query = query.replaceFirst("&?paginationStart=[0-9]+", "");
}
try {
log.info("requesting URL [{}]", query);
@ -261,7 +279,7 @@ public class RestIterator implements Iterator<String> {
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
if (!isEmptyXml(resultXml)) {
resultNode = (Node) this.xpath
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
@ -270,8 +288,7 @@ public class RestIterator implements Iterator<String> {
final StringWriter sw = new StringWriter();
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
final String toEnqueue = sw.toString();
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue)
|| emptyXml.equalsIgnoreCase(toEnqueue)) {
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
log
.warn(
"The following record resulted in empty item for the feeding queue: {}", resultXml);
@ -299,6 +316,7 @@ public class RestIterator implements Iterator<String> {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
}
qUrlArgument = qUrl.getQuery();
final String[] arrayQUrlArgument = qUrlArgument.split("&");
for (final String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
@ -312,7 +330,7 @@ public class RestIterator implements Iterator<String> {
}
}
if (((emptyXml).equalsIgnoreCase(resultXml))
if (isEmptyXml(resultXml)
|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
// resumptionStr = "";
if (nodeList != null) {
@ -331,13 +349,13 @@ public class RestIterator implements Iterator<String> {
case "pagination":
case "page": // pagination, iterate over page numbers
this.pagination += 1;
if (nodeList != null) {
if (nodeList != null && nodeList.getLength() > 0) {
this.discoverResultSize += nodeList.getLength();
} else {
this.resultTotal = this.discoverResultSize;
this.pagination = this.discoverResultSize;
}
this.pagination += 1;
this.resumptionInt = this.pagination;
this.resumptionStr = Integer.toString(this.resumptionInt);
break;
@ -415,6 +433,10 @@ public class RestIterator implements Iterator<String> {
}
private boolean isEmptyXml(String s) {
return EMPTY_XML.equalsIgnoreCase(s);
}
private boolean isInteger(final String s) {
boolean isValidInteger = false;
try {