use of dom4j
commit 52bb7af03b
parent 9073b1159d
OsfPreprintsCollectorPlugin.java

@@ -10,8 +10,6 @@ import java.util.stream.StreamSupport;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.math.NumberUtils;
 
-import com.google.gson.Gson;
-
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;

@@ -32,8 +30,6 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
     public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
         final String baseUrl = api.getBaseUrl();
 
-        final Gson gson = new Gson();
-
         final int pageSize = Optional
             .ofNullable(api.getParams().get("pageSize"))
             .filter(StringUtils::isNotBlank)
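Note: after this change collect() only reads baseUrl and pageSize from the ApiDescriptor; the Gson instance was unused and is dropped. Given the java.util.stream.StreamSupport import visible in the first hunk, the method presumably wraps the OsfPreprintsIterator into a Stream. A minimal sketch of that Iterator-to-Stream wiring follows; the helper name and placement are illustrative, not part of this commit.

    import java.util.Iterator;
    import java.util.Spliterator;
    import java.util.Spliterators;
    import java.util.stream.Stream;
    import java.util.stream.StreamSupport;

    // Hypothetical helper: wraps an Iterator<String> into a sequential Stream,
    // the same pattern collect() is expected to use via StreamSupport.
    final class IteratorStreams {
        static Stream<String> asStream(final Iterator<String> it) {
            return StreamSupport
                .stream(Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
        }
    }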
OsfPreprintsIterator.java

@@ -1,30 +1,15 @@
 
 package eu.dnetlib.dhp.collection.plugin.osf;
 
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
 
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import javax.xml.xpath.XPath;
-import javax.xml.xpath.XPathConstants;
-import javax.xml.xpath.XPathExpression;
-import javax.xml.xpath.XPathFactory;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
+import org.dom4j.Document;
+import org.dom4j.DocumentHelper;
+import org.dom4j.Element;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.InputSource;
 
 import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.collection.CollectorException;
@@ -39,24 +24,11 @@ public class OsfPreprintsIterator implements Iterator<String> {
 
     private final HttpClientParams clientParams;
 
-    private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
-    private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
 
     private final String baseUrl;
     private final int pageSize;
 
-    private int resumptionInt = 0; // integer resumption token (first record to harvest)
-    private int resultTotal = -1;
-    private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
-    // harvest
-    // or token scanned from results)
-    private InputStream resultStream;
-    private Transformer transformer;
-    private XPath xpath;
-    private String query;
-    private XPathExpression xprResultTotalPath;
-    private XPathExpression xprResumptionPath;
-    private XPathExpression xprEntity;
+    private String currentUrl;
     private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
 
     public OsfPreprintsIterator(
@@ -68,38 +40,20 @@ public class OsfPreprintsIterator implements Iterator<String> {
         this.baseUrl = baseUrl;
         this.pageSize = pageSize;
 
-        try {
-            final TransformerFactory factory = TransformerFactory.newInstance();
-            this.transformer = factory.newTransformer();
-            this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-            this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
-            this.xpath = XPathFactory.newInstance().newXPath();
-            this.xprResultTotalPath = this.xpath.compile("/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']");
-            this.xprResumptionPath = this.xpath.compile("substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')");
-            this.xprEntity = this.xpath.compile("/*/*[local-name()='data']");
-        } catch (final Exception e) {
-            throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
-        }
-
         initQueue();
     }
 
     private void initQueue() {
-        this.query = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize;
-        log.info("REST calls starting with {}", this.query);
+        this.currentUrl = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize;
+        log.info("REST calls starting with {}", this.currentUrl);
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see java.util.Iterator#hasNext()
-     */
     @Override
     public boolean hasNext() {
         synchronized (this.recordQueue) {
-            while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
+            while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) {
                 try {
-                    this.query = downloadPage(this.query, 0);
+                    this.currentUrl = downloadPage(this.currentUrl, 0);
                 } catch (final CollectorException e) {
                     log.debug("CollectorPlugin.next()-Exception: {}", e);
                     throw new RuntimeException(e);
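Note: with currentUrl in place, hasNext() keeps calling downloadPage() until either the record queue is non-empty or the OSF "next" link comes back empty. A consumer-side sketch, not part of the commit; the (baseUrl, pageSize, clientParams) constructor order is assumed from the assignments above.

    // Drains all preprint records; pages are fetched lazily inside hasNext().
    final OsfPreprintsIterator it =
        new OsfPreprintsIterator("https://api.osf.io/v2/preprints/", 100, new HttpClientParams());
    while (it.hasNext()) {
        final String record = it.next(); // one detached <data> element, serialized as XML
        // process the record ...
    }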
@@ -112,11 +66,6 @@ public class OsfPreprintsIterator implements Iterator<String> {
         }
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see java.util.Iterator#next()
-     */
     @Override
     public String next() {
         synchronized (this.recordQueue) {
@@ -124,12 +73,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
         }
     }
 
-    /*
-     * download page and return nextQuery (with number of attempt)
-     */
-    private String downloadPage(final String query, final int attempt) throws CollectorException {
+    private String downloadPage(final String url, final int attempt) throws CollectorException {
 
-        if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, query:" + query); }
+        if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
 
         if (attempt > 0) {
             final int delay = (attempt * 5000);
@@ -142,78 +88,35 @@ public class OsfPreprintsIterator implements Iterator<String> {
         }
 
         try {
-            String resultJson;
-            String resultXml = XML_HEADER;
-            String nextQuery = "";
-            Node resultNode = null;
-            NodeList nodeList = null;
-
-            try {
-                log.info("requesting URL [{}]", query);
+            log.info("requesting URL [{}]", url);
 
             final HttpConnector2 connector = new HttpConnector2(this.clientParams);
 
-                resultJson = connector.getInputSource(query);
-                resultXml = JsonUtils.convertToXML(resultJson);
+            final String json = connector.getInputSource(url);
+            final String xml = JsonUtils.convertToXML(json);
 
-                this.resultStream = IOUtils.toInputStream(resultXml, StandardCharsets.UTF_8);
+            final Document doc = DocumentHelper.parseText(xml);
 
-                if (!isEmptyXml(resultXml)) {
-                    resultNode = (Node) this.xpath
-                        .evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
-                    nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
-                    log.debug("nodeList.length: {}", nodeList.getLength());
-                    for (int i = 0; i < nodeList.getLength(); i++) {
-                        final StringWriter sw = new StringWriter();
-                        this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
-                        final String toEnqueue = sw.toString();
-                        if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
-                            log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
-                        } else {
-                            this.recordQueue.add(sw.toString());
-                        }
+            for (final Object o : doc.selectNodes("/*/*[local-name()='data']")) {
+                final Element n = (Element) ((Element) o).detach();
+                for (final Object o1 : n.selectNodes(".//contributors//href")) {
+                    // TODO ADD creators
                 }
-                    }
-                } else {
-                    log.warn("resultXml is equal with emptyXml");
+                for (final Object o1 : n.selectNodes(".//primary_file//href")) {
+                    // TODO ADD fulltexts
                 }
 
-            this.resumptionInt += this.pageSize;
+                this.recordQueue.add(DocumentHelper.createDocument(n).asXML());
 
-            this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
-
-            } catch (final Exception e) {
-                log.error(e.getMessage(), e);
-                throw new IllegalStateException("collection failed: " + e.getMessage());
             }
 
-            try {
-                if (this.resultTotal == -1) {
-                    this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
-                    log.info("resultTotal was -1 is now: " + this.resultTotal);
-                }
-            } catch (final Exception e) {
-                log.error(e.getMessage(), e);
-                throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
-            }
-            log.debug("resultTotal: " + this.resultTotal);
-            log.debug("resInt: " + this.resumptionInt);
-            if (this.resumptionInt <= this.resultTotal) {
-                nextQuery = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize + "&page="
-                    + this.resumptionStr;
-            } else {
-                nextQuery = "";
-            }
-            log.debug("nextQueryUrl: " + nextQuery);
-            return nextQuery;
+            return doc.valueOf("/*/*[local-name()='links']/*[local-name()='next']");
         } catch (final Throwable e) {
             log.warn(e.getMessage(), e);
-            return downloadPage(query, attempt + 1);
+            return downloadPage(url, attempt + 1);
         }
 
     }
 
-    private boolean isEmptyXml(final String s) {
-        return EMPTY_XML.equalsIgnoreCase(s);
-    }
 
 }
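Note: the rewritten downloadPage() is the heart of this commit: the JSON page is converted to XML by JsonUtils.convertToXML, parsed with dom4j, each <data> element is detached and queued as a standalone document, and the next-page URL is read straight from the payload's links/next element. Below is a self-contained sketch of that dom4j flow over a hypothetical inline payload; the <response> wrapper tag and the URL are made up (the real wrapper comes from JsonUtils.XML_WRAP_TAG).

    import org.dom4j.Document;
    import org.dom4j.DocumentHelper;
    import org.dom4j.Element;

    // Requires dom4j (plus jaxen for XPath support) on the classpath.
    public class Dom4jPagingSketch {

        public static void main(final String[] args) throws Exception {
            // Shape assumed for the XML produced from one OSF JSON page.
            final String xml = "<response>"
                + "<data><id>p1</id></data>"
                + "<data><id>p2</id></data>"
                + "<links><next>https://api.osf.io/v2/preprints/?page=2</next></links>"
                + "</response>";

            final Document doc = DocumentHelper.parseText(xml);

            // Each <data> element becomes one record, detached from the page document.
            for (final Object o : doc.selectNodes("/*/*[local-name()='data']")) {
                final Element n = (Element) ((Element) o).detach();
                System.out.println(DocumentHelper.createDocument(n).asXML());
            }

            // The next-page URL is read directly from the payload; no counters needed.
            System.out.println(doc.valueOf("/*/*[local-name()='links']/*[local-name()='next']"));
        }
    }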
OsfPreprintsCollectorPluginTest.java

@@ -24,11 +24,6 @@ public class OsfPreprintsCollectorPluginTest {
 
     private final String baseUrl = "https://api.osf.io/v2/preprints/";
 
-    // private final String requestHeaderMap = "";
-    // private final String authMethod = "";
-    // private final String authToken = "";
-    // private final String resultOutputFormat = "";
-
     private final int pageSize = 100;
 
     private final ApiDescriptor api = new ApiDescriptor();
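Note: the commented-out REST-client fields were dead weight in the test; only baseUrl, pageSize and the ApiDescriptor remain. A hedged sketch of how a test is expected to drive the plugin: the plugin constructor taking HttpClientParams and ApiDescriptor.setBaseUrl are assumptions, only the "pageSize" parameter key and collect(api, report) are visible in this diff.

    @Test
    void testCollect_sketch() throws CollectorException {
        final ApiDescriptor api = new ApiDescriptor();
        api.setBaseUrl("https://api.osf.io/v2/preprints/"); // assumed setter
        api.getParams().put("pageSize", "100");              // key read by collect()

        final Stream<String> stream = new OsfPreprintsCollectorPlugin(new HttpClientParams()) // assumed constructor
            .collect(api, new AggregatorReport());

        stream.limit(5).forEach(System.out::println);
    }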