forked from D-Net/dnet-hadoop
merged from the osfPreprints_plugin branch
This commit is contained in:
commit
4e9f64e01a
|
@ -1,4 +1,3 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.*;
|
||||
|
|
|
@ -69,12 +69,9 @@ public class CollectorWorker extends ReportingJob {
|
|||
scheduleReport(counter);
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
||||
.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer
|
||||
.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
.createWriter(this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
||||
.keyClass(IntWritable.class), SequenceFile.Writer
|
||||
.valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
final IntWritable key = new IntWritable(counter.get());
|
||||
final Text value = new Text();
|
||||
plugin
|
||||
|
|
|
@ -36,9 +36,7 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
|
|||
.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
|
||||
.orElse(PAGE_SIZE_VALUE_DEFAULT);
|
||||
|
||||
if (StringUtils.isBlank(baseUrl)) {
|
||||
throw new CollectorException("Param 'baseUrl' is null or empty");
|
||||
}
|
||||
if (StringUtils.isBlank(baseUrl)) { throw new CollectorException("Param 'baseUrl' is null or empty"); }
|
||||
|
||||
final OsfPreprintsIterator it = new OsfPreprintsIterator(baseUrl, pageSize, getClientParams());
|
||||
|
||||
|
|
|
@ -54,8 +54,7 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
@Override
|
||||
public boolean hasNext() {
|
||||
synchronized (this.recordQueue) {
|
||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
|
||||
&& this.currentUrl.startsWith("http")) {
|
||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl) && this.currentUrl.startsWith("http")) {
|
||||
try {
|
||||
this.currentUrl = downloadPage(this.currentUrl);
|
||||
} catch (final CollectorException e) {
|
||||
|
@ -64,9 +63,7 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
}
|
||||
}
|
||||
|
||||
if (!this.recordQueue.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
if (!this.recordQueue.isEmpty()) { return true; }
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -115,9 +112,7 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
}
|
||||
|
||||
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
||||
if (attempt > MAX_ATTEMPTS) {
|
||||
throw new CollectorException("Max Number of attempts reached, url:" + url);
|
||||
}
|
||||
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
|
||||
|
||||
if (attempt > 0) {
|
||||
final int delay = (attempt * 5000);
|
||||
|
|
|
@ -35,18 +35,6 @@
|
|||
"paramRequired": true
|
||||
}
|
||||
,
|
||||
{
|
||||
"paramName": "wip",
|
||||
"paramLongName": "webCrawlInputPath",
|
||||
"paramDescription": "the path to get the input data from Web Crawl",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pip",
|
||||
"paramLongName": "publisherInputPath",
|
||||
"paramDescription": "the path to get the input data from publishers",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
|
|
|
@ -50,8 +50,7 @@ public class OsfPreprintsCollectorPluginTest {
|
|||
@Test
|
||||
@Disabled
|
||||
void test_one() throws CollectorException {
|
||||
this.plugin
|
||||
.collect(this.api, new AggregatorReport())
|
||||
this.plugin.collect(this.api, new AggregatorReport())
|
||||
.limit(1)
|
||||
.forEach(log::info);
|
||||
}
|
||||
|
@ -96,8 +95,7 @@ public class OsfPreprintsCollectorPluginTest {
|
|||
final HttpConnector2 connector = new HttpConnector2();
|
||||
|
||||
try {
|
||||
final String res = connector
|
||||
.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
||||
final String res = connector.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
||||
System.out.println(res);
|
||||
fail();
|
||||
} catch (final Throwable e) {
|
||||
|
|
|
@ -29,7 +29,7 @@ class JsonPathTest {
|
|||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
|
||||
System.out.println("row = " + row.getAs("country"));
|
||||
System.out.println("row = " + row.getAs("countrytitle"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue