/** * log.debug(...) equal to log.trace(...) in the application-logs *

* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue */ package eu.dnetlib.data.collector.plugins.rest; import java.io.InputStream; import java.io.StringWriter; import java.net.URL; import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.*; import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; import eu.dnetlib.data.collector.rmi.CollectorServiceException; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; /** * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak * @date 2018-09-03 * */ public class RestIterator implements Iterator { // TODO: clean up the comments of replaced source code private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM private static final String wrapName = "recordWrap"; private String baseUrl; private String resumptionType; private String resumptionParam; private String resultFormatValue; private String queryParams; private int resultSizeValue; private int resumptionInt = 0; // integer resumption token (first record to harvest) private int resultTotal = -1; private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest or token scanned from results) private InputStream resultStream; private Transformer transformer; private XPath xpath; private String query; private XPathExpression xprResultTotalPath; private XPathExpression xprResumptionPath; private XPathExpression xprEntity; private String queryFormat; private String querySize; private Queue recordQueue = new PriorityBlockingQueue(); private int discoverResultSize = 0; private int pagination = 1; public RestIterator( final String baseUrl, final String resumptionType, final String resumptionParam, final String resumptionXpath, final String resultTotalXpath, final String resultFormatParam, final String resultFormatValue, final String resultSizeParam, final String resultSizeValueStr, final String queryParams, final String entityXpath ) { this.baseUrl = baseUrl; this.resumptionType = resumptionType; this.resumptionParam = resumptionParam; this.resultFormatValue = resultFormatValue; this.queryParams = queryParams; this.resultSizeValue = Integer.valueOf(resultSizeValueStr); queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : ""; try { initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); } catch (Exception e) { throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); } initQueue(); } private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException { transformer = TransformerFactory.newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); xpath = XPathFactory.newInstance().newXPath(); xprResultTotalPath = xpath.compile(resultTotalXpath); xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); xprEntity = xpath.compile(entityXpath); } private void initQueue() { query = baseUrl + "?" + queryParams + querySize + queryFormat; } private void disconnect() { // TODO close inputstream } /* (non-Javadoc) * @see java.util.Iterator#hasNext() */ @Override public boolean hasNext() { if (recordQueue.isEmpty() && query.isEmpty()) { disconnect(); return false; } else { return true; } } /* (non-Javadoc) * @see java.util.Iterator#next() */ @Override public String next() { synchronized (recordQueue) { while (recordQueue.isEmpty() && !query.isEmpty()) { try { log.info("get Query: " + query); query = downloadPage(query); log.debug("next queryURL from downloadPage(): " + query); } catch (CollectorServiceException e) { log.debug("CollectorPlugin.next()-Exception: " + e); throw new RuntimeException(e); } } return recordQueue.poll(); } } /* * download page and return nextQuery */ private String downloadPage(String query) throws CollectorServiceException { String resultJson; String resultXml = ""; String nextQuery = ""; String emptyXml = resultXml + "<" + wrapName + ">"; Node resultNode = null; NodeList nodeList = null; String qUrlArgument = ""; int urlOldResumptionSize = 0; try { URL qUrl = new URL(query); resultStream = qUrl.openStream(); if ("json".equals(resultFormatValue.toLowerCase())) { resultJson = IOUtils.toString(resultStream, "UTF-8"); resultJson = syntaxConvertJsonKeyNamens(resultJson); org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson); resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element log.trace("before inputStream: " + resultXml); resultXml = XmlCleaner.cleanAllEntities(resultXml); log.trace("after cleaning: " + resultXml); resultStream = IOUtils.toInputStream(resultXml, "UTF-8"); } if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) { resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE); nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET); log.debug("nodeList.length: " + nodeList.getLength()); for (int i = 0; i < nodeList.getLength(); i++) { StringWriter sw = new StringWriter(); transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); recordQueue.add(sw.toString()); } } else { log.info("resultXml is equal with emptyXml"); } resumptionInt += resultSizeValue; switch (resumptionType.toLowerCase()) { case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items resumptionStr = xprResumptionPath.evaluate(resultNode); break; case "count": // begin at one step for all records, iterate over items resumptionStr = Integer.toString(resumptionInt); break; case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");} qUrlArgument = qUrl.getQuery(); String[] arrayQUrlArgument = qUrlArgument.split("&"); for (String arrayUrlArgStr : arrayQUrlArgument) { if (arrayUrlArgStr.startsWith(resumptionParam)) { String[] resumptionKeyValue = arrayUrlArgStr.split("="); urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize); } } if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) || ((nodeList != null) && (nodeList.getLength() < resultSizeValue)) ) { // resumptionStr = ""; if (nodeList != null) { discoverResultSize += nodeList.getLength(); } resultTotal = discoverResultSize; } else { resumptionStr = Integer.toString(resumptionInt); resultTotal = resumptionInt + 1; if (nodeList != null) { discoverResultSize += nodeList.getLength(); } } log.info("discoverResultSize: " + discoverResultSize); break; case "pagination": case "page": // pagination, iterate over pages pagination += 1; if (nodeList != null) { discoverResultSize += nodeList.getLength(); } else { resultTotal = discoverResultSize; pagination = discoverResultSize; } resumptionInt = pagination; resumptionStr = Integer.toString(resumptionInt); break; default: // otherwise: abort // resultTotal = resumptionInt; break; } if (resultTotal == -1) { resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + resultTotal); } log.info("resultTotal: " + resultTotal); log.info("resInt: " + resumptionInt); if (resumptionInt < resultTotal) { nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat; } else nextQuery = ""; log.debug("nextQueryUrl: " + nextQuery); return nextQuery; } catch (Exception e) { log.error(e); throw new IllegalStateException("collection failed: " + e.getMessage()); } } /** * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to '' * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names * and work-around for the JSON to XML converting of org.json.XML-package. * * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"], * * @param jsonInput * @return convertedJsonKeynameOutput */ private String syntaxConvertJsonKeyNamens(String jsonInput) { log.trace("before convertJsonKeyNames: " + jsonInput); // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) // replace ' 's in JSON Namens with '_' while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); } // replace forward-slash (sign '/' ) in JSON Names with '_' while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); } // replace '(' in JSON Names with '' while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); } // replace ')' in JSON Names with '' while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); } // replace startNumbers in JSON Keynames with 'n_' while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":"); } // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with '' while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); } // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) { // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":"); // } // replace '=' in JSON Keynames with '-' while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) { jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":"); } log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); return jsonInput; } /** * * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities * * * @param bufferStr - XML string * @return */ private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) { while (bufferStr.matches(".*<([^<>].*),(.)>.*")) { bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>"); } // replace [#x10-#x1f] with '' // while (bufferStr.matches(".*[0-9a-f].*")) { // bufferStr = bufferStr.replaceAll("([0-9a-fA-F])", ""); // } return bufferStr; } }