1
0
Fork 0

use of dom4j

This commit is contained in:
Michele Artini 2024-09-19 14:59:05 +02:00
parent 9073b1159d
commit 52bb7af03b
3 changed files with 27 additions and 133 deletions

View File

@ -10,8 +10,6 @@ import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import com.google.gson.Gson;
import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@ -32,8 +30,6 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException { public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
final String baseUrl = api.getBaseUrl(); final String baseUrl = api.getBaseUrl();
final Gson gson = new Gson();
final int pageSize = Optional final int pageSize = Optional
.ofNullable(api.getParams().get("pageSize")) .ofNullable(api.getParams().get("pageSize"))
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)

View File

@ -1,30 +1,15 @@
package eu.dnetlib.dhp.collection.plugin.osf; package eu.dnetlib.dhp.collection.plugin.osf;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.Iterator; import java.util.Iterator;
import java.util.Queue; import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.PriorityBlockingQueue;
import javax.xml.transform.OutputKeys; import org.dom4j.Document;
import javax.xml.transform.Transformer; import org.dom4j.DocumentHelper;
import javax.xml.transform.TransformerFactory; import org.dom4j.Element;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
@ -39,24 +24,11 @@ public class OsfPreprintsIterator implements Iterator<String> {
private final HttpClientParams clientParams; private final HttpClientParams clientParams;
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
private final String baseUrl; private final String baseUrl;
private final int pageSize; private final int pageSize;
private int resumptionInt = 0; // integer resumption token (first record to harvest) private String currentUrl;
private int resultTotal = -1;
private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
// harvest
// or token scanned from results)
private InputStream resultStream;
private Transformer transformer;
private XPath xpath;
private String query;
private XPathExpression xprResultTotalPath;
private XPathExpression xprResumptionPath;
private XPathExpression xprEntity;
private final Queue<String> recordQueue = new PriorityBlockingQueue<>(); private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
public OsfPreprintsIterator( public OsfPreprintsIterator(
@ -68,38 +40,20 @@ public class OsfPreprintsIterator implements Iterator<String> {
this.baseUrl = baseUrl; this.baseUrl = baseUrl;
this.pageSize = pageSize; this.pageSize = pageSize;
try {
final TransformerFactory factory = TransformerFactory.newInstance();
this.transformer = factory.newTransformer();
this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
this.xpath = XPathFactory.newInstance().newXPath();
this.xprResultTotalPath = this.xpath.compile("/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']");
this.xprResumptionPath = this.xpath.compile("substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')");
this.xprEntity = this.xpath.compile("/*/*[local-name()='data']");
} catch (final Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
}
initQueue(); initQueue();
} }
private void initQueue() { private void initQueue() {
this.query = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize; this.currentUrl = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize;
log.info("REST calls starting with {}", this.query); log.info("REST calls starting with {}", this.currentUrl);
} }
/*
* (non-Javadoc)
*
* @see java.util.Iterator#hasNext()
*/
@Override @Override
public boolean hasNext() { public boolean hasNext() {
synchronized (this.recordQueue) { synchronized (this.recordQueue) {
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) { while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) {
try { try {
this.query = downloadPage(this.query, 0); this.currentUrl = downloadPage(this.currentUrl, 0);
} catch (final CollectorException e) { } catch (final CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: {}", e); log.debug("CollectorPlugin.next()-Exception: {}", e);
throw new RuntimeException(e); throw new RuntimeException(e);
@ -112,11 +66,6 @@ public class OsfPreprintsIterator implements Iterator<String> {
} }
} }
/*
* (non-Javadoc)
*
* @see java.util.Iterator#next()
*/
@Override @Override
public String next() { public String next() {
synchronized (this.recordQueue) { synchronized (this.recordQueue) {
@ -124,12 +73,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
} }
} }
/* private String downloadPage(final String url, final int attempt) throws CollectorException {
* download page and return nextQuery (with number of attempt)
*/
private String downloadPage(final String query, final int attempt) throws CollectorException {
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, query:" + query); } if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
if (attempt > 0) { if (attempt > 0) {
final int delay = (attempt * 5000); final int delay = (attempt * 5000);
@ -142,78 +88,35 @@ public class OsfPreprintsIterator implements Iterator<String> {
} }
try { try {
String resultJson; log.info("requesting URL [{}]", url);
String resultXml = XML_HEADER;
String nextQuery = "";
Node resultNode = null;
NodeList nodeList = null;
try { final HttpConnector2 connector = new HttpConnector2(this.clientParams);
log.info("requesting URL [{}]", query);
final HttpConnector2 connector = new HttpConnector2(this.clientParams); final String json = connector.getInputSource(url);
final String xml = JsonUtils.convertToXML(json);
resultJson = connector.getInputSource(query); final Document doc = DocumentHelper.parseText(xml);
resultXml = JsonUtils.convertToXML(resultJson);
this.resultStream = IOUtils.toInputStream(resultXml, StandardCharsets.UTF_8); for (final Object o : doc.selectNodes("/*/*[local-name()='data']")) {
final Element n = (Element) ((Element) o).detach();
if (!isEmptyXml(resultXml)) { for (final Object o1 : n.selectNodes(".//contributors//href")) {
resultNode = (Node) this.xpath // TODO ADD creators
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE); }
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET); for (final Object o1 : n.selectNodes(".//primary_file//href")) {
log.debug("nodeList.length: {}", nodeList.getLength()); // TODO ADD fulltexts
for (int i = 0; i < nodeList.getLength(); i++) {
final StringWriter sw = new StringWriter();
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
final String toEnqueue = sw.toString();
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
} else {
this.recordQueue.add(sw.toString());
}
}
} else {
log.warn("resultXml is equal with emptyXml");
} }
this.resumptionInt += this.pageSize; this.recordQueue.add(DocumentHelper.createDocument(n).asXML());
this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
} catch (final Exception e) {
log.error(e.getMessage(), e);
throw new IllegalStateException("collection failed: " + e.getMessage());
} }
try { return doc.valueOf("/*/*[local-name()='links']/*[local-name()='next']");
if (this.resultTotal == -1) {
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
log.info("resultTotal was -1 is now: " + this.resultTotal);
}
} catch (final Exception e) {
log.error(e.getMessage(), e);
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
}
log.debug("resultTotal: " + this.resultTotal);
log.debug("resInt: " + this.resumptionInt);
if (this.resumptionInt <= this.resultTotal) {
nextQuery = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize + "&page="
+ this.resumptionStr;
} else {
nextQuery = "";
}
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
} catch (final Throwable e) { } catch (final Throwable e) {
log.warn(e.getMessage(), e); log.warn(e.getMessage(), e);
return downloadPage(query, attempt + 1); return downloadPage(url, attempt + 1);
} }
} }
private boolean isEmptyXml(final String s) {
return EMPTY_XML.equalsIgnoreCase(s);
}
} }

View File

@ -24,11 +24,6 @@ public class OsfPreprintsCollectorPluginTest {
private final String baseUrl = "https://api.osf.io/v2/preprints/"; private final String baseUrl = "https://api.osf.io/v2/preprints/";
// private final String requestHeaderMap = "";
// private final String authMethod = "";
// private final String authToken = "";
// private final String resultOutputFormat = "";
private final int pageSize = 100; private final int pageSize = 100;
private final ApiDescriptor api = new ApiDescriptor(); private final ApiDescriptor api = new ApiDescriptor();