osf plugin: links to contributors and primary_file
This commit is contained in:
parent
52bb7af03b
commit
339d8124f2
|
@ -8,6 +8,7 @@ import java.util.concurrent.PriorityBlockingQueue;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentHelper;
|
import org.dom4j.DocumentHelper;
|
||||||
import org.dom4j.Element;
|
import org.dom4j.Element;
|
||||||
|
import org.dom4j.Node;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -53,7 +54,7 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
||||||
synchronized (this.recordQueue) {
|
synchronized (this.recordQueue) {
|
||||||
while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) {
|
while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) {
|
||||||
try {
|
try {
|
||||||
this.currentUrl = downloadPage(this.currentUrl, 0);
|
this.currentUrl = downloadPage(this.currentUrl);
|
||||||
} catch (final CollectorException e) {
|
} catch (final CollectorException e) {
|
||||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
@ -73,8 +74,36 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String downloadPage(final String url, final int attempt) throws CollectorException {
|
private String downloadPage(final String url) throws CollectorException {
|
||||||
|
|
||||||
|
final Document doc = downloadUrl(url, 0);
|
||||||
|
|
||||||
|
for (final Object o : doc.selectNodes("/*/data")) {
|
||||||
|
|
||||||
|
final Element n = (Element) ((Element) o).detach();
|
||||||
|
|
||||||
|
final Element group = DocumentHelper.createElement("group");
|
||||||
|
group.addAttribute("id", n.valueOf(".//data/id"));
|
||||||
|
|
||||||
|
group.addElement("preprint").add(n);
|
||||||
|
|
||||||
|
for (final Object o1 : n.selectNodes(".//contributors//href")) {
|
||||||
|
final Document doc1 = downloadUrl(((Node) o1).getText(), 0);
|
||||||
|
group.addElement("contributors").add(doc1.getRootElement().detach());
|
||||||
|
}
|
||||||
|
for (final Object o1 : n.selectNodes(".//primary_file//href")) {
|
||||||
|
final Document doc1 = downloadUrl(((Node) o1).getText(), 0);
|
||||||
|
group.addElement("primary_file").add(doc1.getRootElement().detach());
|
||||||
|
}
|
||||||
|
|
||||||
|
this.recordQueue.add(DocumentHelper.createDocument(group).asXML());
|
||||||
|
}
|
||||||
|
|
||||||
|
return doc.valueOf("/*/links/next");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
||||||
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
|
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
|
||||||
|
|
||||||
if (attempt > 0) {
|
if (attempt > 0) {
|
||||||
|
@ -95,28 +124,10 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
||||||
final String json = connector.getInputSource(url);
|
final String json = connector.getInputSource(url);
|
||||||
final String xml = JsonUtils.convertToXML(json);
|
final String xml = JsonUtils.convertToXML(json);
|
||||||
|
|
||||||
final Document doc = DocumentHelper.parseText(xml);
|
return DocumentHelper.parseText(xml);
|
||||||
|
|
||||||
for (final Object o : doc.selectNodes("/*/*[local-name()='data']")) {
|
|
||||||
final Element n = (Element) ((Element) o).detach();
|
|
||||||
|
|
||||||
for (final Object o1 : n.selectNodes(".//contributors//href")) {
|
|
||||||
// TODO ADD creators
|
|
||||||
}
|
|
||||||
for (final Object o1 : n.selectNodes(".//primary_file//href")) {
|
|
||||||
// TODO ADD fulltexts
|
|
||||||
}
|
|
||||||
|
|
||||||
this.recordQueue.add(DocumentHelper.createDocument(n).asXML());
|
|
||||||
}
|
|
||||||
|
|
||||||
return doc.valueOf("/*/*[local-name()='links']/*[local-name()='next']");
|
|
||||||
|
|
||||||
} catch (final Throwable e) {
|
} catch (final Throwable e) {
|
||||||
log.warn(e.getMessage(), e);
|
log.warn(e.getMessage(), e);
|
||||||
return downloadPage(url, attempt + 1);
|
return downloadUrl(url, attempt + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,6 +43,14 @@ public class OsfPreprintsCollectorPluginTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
|
void test_one() throws CollectorException {
|
||||||
|
this.plugin.collect(this.api, new AggregatorReport())
|
||||||
|
.limit(1)
|
||||||
|
.forEach(log::info);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
// @Disabled
|
||||||
void test_limited() throws CollectorException {
|
void test_limited() throws CollectorException {
|
||||||
final AtomicInteger i = new AtomicInteger(0);
|
final AtomicInteger i = new AtomicInteger(0);
|
||||||
final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());
|
final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());
|
||||||
|
|
Loading…
Reference in New Issue