partial implementation of osfPreprints plugin + tests

This commit is contained in:
Michele Artini 2024-09-19 13:58:53 +02:00
parent dcf09811a2
commit 9073b1159d
2 changed files with 18 additions and 53 deletions

View File

@ -3,8 +3,6 @@ package eu.dnetlib.dhp.collection.plugin.osf;
import java.io.InputStream; import java.io.InputStream;
import java.io.StringWriter; import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Iterator; import java.util.Iterator;
import java.util.Queue; import java.util.Queue;
@ -31,18 +29,18 @@ import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
public class OsfPreprintsIterator implements Iterator<String> { public class OsfPreprintsIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(OsfPreprintsIterator.class); private static final Logger log = LoggerFactory.getLogger(OsfPreprintsIterator.class);
public static final String UTF_8 = "UTF-8";
private static final int MAX_ATTEMPTS = 5; private static final int MAX_ATTEMPTS = 5;
private final HttpClientParams clientParams; private final HttpClientParams clientParams;
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
+ ">";
private final String baseUrl; private final String baseUrl;
private final int pageSize; private final int pageSize;
@ -91,10 +89,6 @@ public class OsfPreprintsIterator implements Iterator<String> {
log.info("REST calls starting with {}", this.query); log.info("REST calls starting with {}", this.query);
} }
private void disconnect() {
// TODO close inputstream
}
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
@ -114,7 +108,6 @@ public class OsfPreprintsIterator implements Iterator<String> {
if (!this.recordQueue.isEmpty()) { return true; } if (!this.recordQueue.isEmpty()) { return true; }
disconnect();
return false; return false;
} }
} }
@ -158,15 +151,12 @@ public class OsfPreprintsIterator implements Iterator<String> {
try { try {
log.info("requesting URL [{}]", query); log.info("requesting URL [{}]", query);
final URL qUrl = new URL(query); final HttpConnector2 connector = new HttpConnector2(this.clientParams);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); resultJson = connector.getInputSource(query);
conn.setRequestMethod("GET");
this.resultStream = conn.getInputStream();
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson); resultXml = JsonUtils.convertToXML(resultJson);
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
this.resultStream = IOUtils.toInputStream(resultXml, StandardCharsets.UTF_8);
if (!isEmptyXml(resultXml)) { if (!isEmptyXml(resultXml)) {
resultNode = (Node) this.xpath resultNode = (Node) this.xpath
@ -178,8 +168,7 @@ public class OsfPreprintsIterator implements Iterator<String> {
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
final String toEnqueue = sw.toString(); final String toEnqueue = sw.toString();
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) { if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
log log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
} else { } else {
this.recordQueue.add(sw.toString()); this.recordQueue.add(sw.toString());
} }
@ -213,8 +202,6 @@ public class OsfPreprintsIterator implements Iterator<String> {
+ this.resumptionStr; + this.resumptionStr;
} else { } else {
nextQuery = ""; nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
// resumptionInt and prevent a NullPointer Exception at mdStore
} }
log.debug("nextQueryUrl: " + nextQuery); log.debug("nextQueryUrl: " + nextQuery);
return nextQuery; return nextQuery;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection.plugin.rest; package eu.dnetlib.dhp.collection.plugin.osf;
import java.util.HashMap; import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
@ -18,9 +18,9 @@ import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
public class OsfPreprintCollectorTest { public class OsfPreprintsCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(OsfPreprintCollectorTest.class); private static final Logger log = LoggerFactory.getLogger(OsfPreprintsCollectorPlugin.class);
private final String baseUrl = "https://api.osf.io/v2/preprints/"; private final String baseUrl = "https://api.osf.io/v2/preprints/";
@ -29,50 +29,28 @@ public class OsfPreprintCollectorTest {
// private final String authToken = ""; // private final String authToken = "";
// private final String resultOutputFormat = ""; // private final String resultOutputFormat = "";
private final String queryParams = "filter:is_published:d=true"; private final int pageSize = 100;
private final String entityXpath = "/*/*[local-name()='data']";
private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";
private final String resumptionParam = "page";
private final String resumptionType = "scan";
private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";
private final String resultSizeParam = "page[size]";
private final String resultSizeValue = "100";
private final String resultFormatParam = "format";
private final String resultFormatValue = "json";
private final ApiDescriptor api = new ApiDescriptor(); private final ApiDescriptor api = new ApiDescriptor();
private RestCollectorPlugin rcp;
private OsfPreprintsCollectorPlugin plugin;
@BeforeEach @BeforeEach
public void setUp() { public void setUp() {
final HashMap<String, String> params = new HashMap<>(); final HashMap<String, String> params = new HashMap<>();
params.put("resumptionType", this.resumptionType); params.put("pageSize", "" + this.pageSize);
params.put("resumptionParam", this.resumptionParam);
params.put("resumptionXpath", this.resumptionXpath);
params.put("resultTotalXpath", this.resultTotalXpath);
params.put("resultFormatParam", this.resultFormatParam);
params.put("resultFormatValue", this.resultFormatValue);
params.put("resultSizeParam", this.resultSizeParam);
params.put("resultSizeValue", this.resultSizeValue);
params.put("queryParams", this.queryParams);
params.put("entityXpath", this.entityXpath);
this.api.setBaseUrl(this.baseUrl); this.api.setBaseUrl(this.baseUrl);
this.api.setParams(params); this.api.setParams(params);
this.rcp = new RestCollectorPlugin(new HttpClientParams()); this.plugin = new OsfPreprintsCollectorPlugin(new HttpClientParams());
} }
@Test @Test
@Disabled @Disabled
void test_limited() throws CollectorException { void test_limited() throws CollectorException {
final AtomicInteger i = new AtomicInteger(0); final AtomicInteger i = new AtomicInteger(0);
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport()); final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());
stream.limit(2000).forEach(s -> { stream.limit(2000).forEach(s -> {
Assertions.assertTrue(s.length() > 0); Assertions.assertTrue(s.length() > 0);
@ -88,7 +66,7 @@ public class OsfPreprintCollectorTest {
@Disabled @Disabled
void test_all() throws CollectorException { void test_all() throws CollectorException {
final AtomicLong i = new AtomicLong(0); final AtomicLong i = new AtomicLong(0);
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport()); final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());
stream.forEach(s -> { stream.forEach(s -> {
Assertions.assertTrue(s.length() > 0); Assertions.assertTrue(s.length() > 0);