dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestIterator.java

344 lines
12 KiB
Java

/**
* log.debug(...) equal to log.trace(...) in the application-logs
* <p>
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
*/
package eu.dnetlib.data.collector.plugins.rest;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
* @date 2018-09-03
*
*/
public class RestIterator implements Iterator<String> {
// TODO: clean up the comments of replaced source code
private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
private static final String wrapName = "recordWrap";
private String baseUrl;
private String resumptionType;
private String resumptionParam;
private String resultFormatValue;
private String queryParams;
private int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest or token scanned from results)
private InputStream resultStream;
private Transformer transformer;
private XPath xpath;
private String query;
private XPathExpression xprResultTotalPath;
private XPathExpression xprResumptionPath;
private XPathExpression xprEntity;
private String queryFormat;
private String querySize;
private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
private int discoverResultSize = 0;
private int pagination = 1;
public RestIterator(
final String baseUrl,
final String resumptionType,
final String resumptionParam,
final String resumptionXpath,
final String resultTotalXpath,
final String resultFormatParam,
final String resultFormatValue,
final String resultSizeParam,
final String resultSizeValueStr,
final String queryParams,
final String entityXpath
) {
this.baseUrl = baseUrl;
this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam;
this.resultFormatValue = resultFormatValue;
this.queryParams = queryParams;
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
try {
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
} catch (Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
}
initQueue();
}
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
throws TransformerConfigurationException, XPathExpressionException {
transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
xpath = XPathFactory.newInstance().newXPath();
xprResultTotalPath = xpath.compile(resultTotalXpath);
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
xprEntity = xpath.compile(entityXpath);
}
private void initQueue() {
query = baseUrl + "?" + queryParams + querySize + queryFormat;
}
private void disconnect() {
// TODO close inputstream
}
/* (non-Javadoc)
* @see java.util.Iterator#hasNext()
*/
@Override
public boolean hasNext() {
if (recordQueue.isEmpty() && query.isEmpty()) {
disconnect();
return false;
} else {
return true;
}
}
/* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public String next() {
synchronized (recordQueue) {
while (recordQueue.isEmpty() && !query.isEmpty()) {
try {
log.info("get Query: " + query);
query = downloadPage(query);
log.debug("next queryURL from downloadPage(): " + query);
} catch (CollectorServiceException e) {
log.debug("CollectorPlugin.next()-Exception: " + e);
throw new RuntimeException(e);
}
}
return recordQueue.poll();
}
}
/*
* download page and return nextQuery
*/
private String downloadPage(String query) throws CollectorServiceException {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String nextQuery = "";
String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
int urlOldResumptionSize = 0;
try {
URL qUrl = new URL(query);
resultStream = qUrl.openStream();
if ("json".equals(resultFormatValue.toLowerCase())) {
resultJson = IOUtils.toString(resultStream, "UTF-8");
resultJson = syntaxConvertJsonKeyNamens(resultJson);
org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
log.trace("before inputStream: " + resultXml);
resultXml = XmlCleaner.cleanAllEntities(resultXml);
log.trace("after cleaning: " + resultXml);
resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
}
if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: " + nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) {
StringWriter sw = new StringWriter();
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
recordQueue.add(sw.toString());
}
} else { log.info("resultXml is equal with emptyXml"); }
resumptionInt += resultSizeValue;
switch (resumptionType.toLowerCase()) {
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
resumptionStr = xprResumptionPath.evaluate(resultNode);
break;
case "count": // begin at one step for all records, iterate over items
resumptionStr = Integer.toString(resumptionInt);
break;
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
qUrlArgument = qUrl.getQuery();
String[] arrayQUrlArgument = qUrlArgument.split("&");
for (String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(resumptionParam)) {
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
}
}
if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
) {
// resumptionStr = "";
if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
resultTotal = discoverResultSize;
} else {
resumptionStr = Integer.toString(resumptionInt);
resultTotal = resumptionInt + 1;
if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
}
log.info("discoverResultSize: " + discoverResultSize);
break;
case "pagination":
case "page": // pagination, iterate over pages
pagination += 1;
if (nodeList != null) {
discoverResultSize += nodeList.getLength();
} else {
resultTotal = discoverResultSize;
pagination = discoverResultSize;
}
resumptionInt = pagination;
resumptionStr = Integer.toString(resumptionInt);
break;
default: // otherwise: abort
// resultTotal = resumptionInt;
break;
}
if (resultTotal == -1) {
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; } // to correct the upper bound
log.info("resultTotal was -1 is now: " + resultTotal);
}
log.info("resultTotal: " + resultTotal);
log.info("resInt: " + resumptionInt);
if (resumptionInt < resultTotal) {
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
} else
nextQuery = "";
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
} catch (Exception e) {
log.error(e);
throw new IllegalStateException("collection failed: " + e.getMessage());
}
}
/**
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
* and work-around for the JSON to XML converting of org.json.XML-package.
*
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
*
* @param jsonInput
* @return convertedJsonKeynameOutput
*/
private String syntaxConvertJsonKeyNamens(String jsonInput) {
log.trace("before convertJsonKeyNames: " + jsonInput);
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
// replace ' 's in JSON Namens with '_'
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
}
// replace forward-slash (sign '/' ) in JSON Names with '_'
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
}
// replace '(' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
}
// replace ')' in JSON Names with ''
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
}
// replace startNumbers in JSON Keynames with 'n_'
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
}
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
}
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
// }
// replace '=' in JSON Keynames with '-'
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
}
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
return jsonInput;
}
/**
*
* https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
* *
* @param bufferStr - XML string
* @return
*/
private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
}
// replace [#x10-#x1f] with ''
// while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
// bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
// }
return bufferStr;
}
}