1
0
Fork 0

osf plugin: links to contributors and primary_file

This commit is contained in:
Michele Artini 2024-09-20 08:44:05 +02:00
parent 52bb7af03b
commit 339d8124f2
2 changed files with 41 additions and 22 deletions

View File

@ -8,6 +8,7 @@ import java.util.concurrent.PriorityBlockingQueue;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentHelper; import org.dom4j.DocumentHelper;
import org.dom4j.Element; import org.dom4j.Element;
import org.dom4j.Node;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -53,7 +54,7 @@ public class OsfPreprintsIterator implements Iterator<String> {
synchronized (this.recordQueue) { synchronized (this.recordQueue) {
while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) { while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) {
try { try {
this.currentUrl = downloadPage(this.currentUrl, 0); this.currentUrl = downloadPage(this.currentUrl);
} catch (final CollectorException e) { } catch (final CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: {}", e); log.debug("CollectorPlugin.next()-Exception: {}", e);
throw new RuntimeException(e); throw new RuntimeException(e);
@ -73,8 +74,36 @@ public class OsfPreprintsIterator implements Iterator<String> {
} }
} }
private String downloadPage(final String url, final int attempt) throws CollectorException { private String downloadPage(final String url) throws CollectorException {
final Document doc = downloadUrl(url, 0);
for (final Object o : doc.selectNodes("/*/data")) {
final Element n = (Element) ((Element) o).detach();
final Element group = DocumentHelper.createElement("group");
group.addAttribute("id", n.valueOf(".//data/id"));
group.addElement("preprint").add(n);
for (final Object o1 : n.selectNodes(".//contributors//href")) {
final Document doc1 = downloadUrl(((Node) o1).getText(), 0);
group.addElement("contributors").add(doc1.getRootElement().detach());
}
for (final Object o1 : n.selectNodes(".//primary_file//href")) {
final Document doc1 = downloadUrl(((Node) o1).getText(), 0);
group.addElement("primary_file").add(doc1.getRootElement().detach());
}
this.recordQueue.add(DocumentHelper.createDocument(group).asXML());
}
return doc.valueOf("/*/links/next");
}
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); } if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
if (attempt > 0) { if (attempt > 0) {
@ -95,28 +124,10 @@ public class OsfPreprintsIterator implements Iterator<String> {
final String json = connector.getInputSource(url); final String json = connector.getInputSource(url);
final String xml = JsonUtils.convertToXML(json); final String xml = JsonUtils.convertToXML(json);
final Document doc = DocumentHelper.parseText(xml); return DocumentHelper.parseText(xml);
for (final Object o : doc.selectNodes("/*/*[local-name()='data']")) {
final Element n = (Element) ((Element) o).detach();
for (final Object o1 : n.selectNodes(".//contributors//href")) {
// TODO ADD creators
}
for (final Object o1 : n.selectNodes(".//primary_file//href")) {
// TODO ADD fulltexts
}
this.recordQueue.add(DocumentHelper.createDocument(n).asXML());
}
return doc.valueOf("/*/*[local-name()='links']/*[local-name()='next']");
} catch (final Throwable e) { } catch (final Throwable e) {
log.warn(e.getMessage(), e); log.warn(e.getMessage(), e);
return downloadPage(url, attempt + 1); return downloadUrl(url, attempt + 1);
} }
} }
} }

View File

@ -43,6 +43,14 @@ public class OsfPreprintsCollectorPluginTest {
@Test @Test
@Disabled @Disabled
void test_one() throws CollectorException {
this.plugin.collect(this.api, new AggregatorReport())
.limit(1)
.forEach(log::info);
}
@Test
// @Disabled
void test_limited() throws CollectorException { void test_limited() throws CollectorException {
final AtomicInteger i = new AtomicInteger(0); final AtomicInteger i = new AtomicInteger(0);
final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport()); final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());