From 52bb7af03b87c8d40f4b304b7920b59720ddceba Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 19 Sep 2024 14:59:05 +0200 Subject: [PATCH] use of dom4j --- .../osf/OsfPreprintsCollectorPlugin.java | 4 - .../plugin/osf/OsfPreprintsIterator.java | 151 ++++-------------- .../osf/OsfPreprintsCollectorPluginTest.java | 5 - 3 files changed, 27 insertions(+), 133 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPlugin.java index bd628efb85..fdc9df06f9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPlugin.java @@ -10,8 +10,6 @@ import java.util.stream.StreamSupport; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; -import com.google.gson.Gson; - import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; @@ -32,8 +30,6 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin { public Stream collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException { final String baseUrl = api.getBaseUrl(); - final Gson gson = new Gson(); - final int pageSize = Optional .ofNullable(api.getParams().get("pageSize")) .filter(StringUtils::isNotBlank) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsIterator.java index a3b1dc2379..311daf859a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsIterator.java @@ -1,30 +1,15 @@ package eu.dnetlib.dhp.collection.plugin.osf; -import java.io.InputStream; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathFactory; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.DocumentHelper; +import org.dom4j.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.InputSource; import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; @@ -39,24 +24,11 @@ public class OsfPreprintsIterator implements Iterator { private final HttpClientParams clientParams; - private static final String XML_HEADER = ""; - private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + ">"; - private final String baseUrl; private final int pageSize; - private int resumptionInt = 0; // integer resumption token (first record to harvest) - private int resultTotal = -1; - private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to - // harvest - // or token scanned from results) - private InputStream resultStream; - private Transformer transformer; - private XPath xpath; - private String query; - private XPathExpression xprResultTotalPath; - private XPathExpression xprResumptionPath; - private XPathExpression xprEntity; + private String currentUrl; + private final Queue recordQueue = new PriorityBlockingQueue<>(); public OsfPreprintsIterator( @@ -68,38 +40,20 @@ public class OsfPreprintsIterator implements Iterator { this.baseUrl = baseUrl; this.pageSize = pageSize; - try { - final TransformerFactory factory = TransformerFactory.newInstance(); - this.transformer = factory.newTransformer(); - this.transformer.setOutputProperty(OutputKeys.INDENT, "yes"); - this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); - this.xpath = XPathFactory.newInstance().newXPath(); - this.xprResultTotalPath = this.xpath.compile("/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']"); - this.xprResumptionPath = this.xpath.compile("substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')"); - this.xprEntity = this.xpath.compile("/*/*[local-name()='data']"); - } catch (final Exception e) { - throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); - } - initQueue(); } private void initQueue() { - this.query = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize; - log.info("REST calls starting with {}", this.query); + this.currentUrl = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize; + log.info("REST calls starting with {}", this.currentUrl); } - /* - * (non-Javadoc) - * - * @see java.util.Iterator#hasNext() - */ @Override public boolean hasNext() { synchronized (this.recordQueue) { - while (this.recordQueue.isEmpty() && !this.query.isEmpty()) { + while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) { try { - this.query = downloadPage(this.query, 0); + this.currentUrl = downloadPage(this.currentUrl, 0); } catch (final CollectorException e) { log.debug("CollectorPlugin.next()-Exception: {}", e); throw new RuntimeException(e); @@ -112,11 +66,6 @@ public class OsfPreprintsIterator implements Iterator { } } - /* - * (non-Javadoc) - * - * @see java.util.Iterator#next() - */ @Override public String next() { synchronized (this.recordQueue) { @@ -124,12 +73,9 @@ public class OsfPreprintsIterator implements Iterator { } } - /* - * download page and return nextQuery (with number of attempt) - */ - private String downloadPage(final String query, final int attempt) throws CollectorException { + private String downloadPage(final String url, final int attempt) throws CollectorException { - if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, query:" + query); } + if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); } if (attempt > 0) { final int delay = (attempt * 5000); @@ -142,78 +88,35 @@ public class OsfPreprintsIterator implements Iterator { } try { - String resultJson; - String resultXml = XML_HEADER; - String nextQuery = ""; - Node resultNode = null; - NodeList nodeList = null; + log.info("requesting URL [{}]", url); - try { - log.info("requesting URL [{}]", query); + final HttpConnector2 connector = new HttpConnector2(this.clientParams); - final HttpConnector2 connector = new HttpConnector2(this.clientParams); + final String json = connector.getInputSource(url); + final String xml = JsonUtils.convertToXML(json); - resultJson = connector.getInputSource(query); - resultXml = JsonUtils.convertToXML(resultJson); + final Document doc = DocumentHelper.parseText(xml); - this.resultStream = IOUtils.toInputStream(resultXml, StandardCharsets.UTF_8); + for (final Object o : doc.selectNodes("/*/*[local-name()='data']")) { + final Element n = (Element) ((Element) o).detach(); - if (!isEmptyXml(resultXml)) { - resultNode = (Node) this.xpath - .evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE); - nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET); - log.debug("nodeList.length: {}", nodeList.getLength()); - for (int i = 0; i < nodeList.getLength(); i++) { - final StringWriter sw = new StringWriter(); - this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); - final String toEnqueue = sw.toString(); - if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) { - log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml); - } else { - this.recordQueue.add(sw.toString()); - } - } - } else { - log.warn("resultXml is equal with emptyXml"); + for (final Object o1 : n.selectNodes(".//contributors//href")) { + // TODO ADD creators + } + for (final Object o1 : n.selectNodes(".//primary_file//href")) { + // TODO ADD fulltexts } - this.resumptionInt += this.pageSize; - - this.resumptionStr = this.xprResumptionPath.evaluate(resultNode); - - } catch (final Exception e) { - log.error(e.getMessage(), e); - throw new IllegalStateException("collection failed: " + e.getMessage()); + this.recordQueue.add(DocumentHelper.createDocument(n).asXML()); } - try { - if (this.resultTotal == -1) { - this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode)); - log.info("resultTotal was -1 is now: " + this.resultTotal); - } - } catch (final Exception e) { - log.error(e.getMessage(), e); - throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage()); - } - log.debug("resultTotal: " + this.resultTotal); - log.debug("resInt: " + this.resumptionInt); - if (this.resumptionInt <= this.resultTotal) { - nextQuery = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize + "&page=" - + this.resumptionStr; - } else { - nextQuery = ""; - } - log.debug("nextQueryUrl: " + nextQuery); - return nextQuery; + return doc.valueOf("/*/*[local-name()='links']/*[local-name()='next']"); + } catch (final Throwable e) { log.warn(e.getMessage(), e); - return downloadPage(query, attempt + 1); + return downloadPage(url, attempt + 1); } } - private boolean isEmptyXml(final String s) { - return EMPTY_XML.equalsIgnoreCase(s); - } - } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPluginTest.java index f6ddb3aebf..10b93ea76e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPluginTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/osf/OsfPreprintsCollectorPluginTest.java @@ -24,11 +24,6 @@ public class OsfPreprintsCollectorPluginTest { private final String baseUrl = "https://api.osf.io/v2/preprints/"; - // private final String requestHeaderMap = ""; - // private final String authMethod = ""; - // private final String authToken = ""; - // private final String resultOutputFormat = ""; - private final int pageSize = 100; private final ApiDescriptor api = new ApiDescriptor();