forked from D-Net/dnet-hadoop
Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta
commit 599e56dbc6
@@ -212,11 +212,11 @@ public class HttpConnector2 {
 					.format(
 						"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
 						MAPPER.writeValueAsString(report)));
-		} catch (MalformedURLException | UnknownHostException e) {
+		} catch (MalformedURLException e) {
 			log.error(e.getMessage(), e);
 			report.put(e.getClass().getName(), e.getMessage());
 			throw new CollectorException(e.getMessage(), e);
-		} catch (SocketTimeoutException | SocketException e) {
+		} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
 			log.error(e.getMessage(), e);
 			report.put(e.getClass().getName(), e.getMessage());
 			backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);

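This hunk moves UnknownHostException out of the terminal MalformedURLException branch and into the SocketTimeoutException/SocketException branch, so a failed host lookup is now treated as a transient error: it is logged, recorded in the report, and retried via backoffAndSleep instead of immediately raising a CollectorException. A minimal sketch of that retry policy, with illustrative names rather than the actual HttpConnector2 API:

import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.concurrent.Callable;

// Sketch only: linear backoff (delay * attempt number), mirroring
// backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000).
public class RetryOnTransientErrors {

	static <T> T withRetries(final Callable<T> call, final int maxRetries, final long delayMs) throws Exception {
		for (int retryNumber = 1; retryNumber <= maxRetries; retryNumber++) {
			try {
				return call.call();
			} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
				// transient network failure: wait longer on every attempt, then retry
				Thread.sleep(delayMs * retryNumber);
			}
		}
		throw new Exception("max number of retries reached");
	}

	public static void main(final String[] args) throws Exception {
		System.out.println(withRetries(() -> "ok", 3, 100L));
	}
}
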
@@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable {
 		final String workingPath = parser.get("inputPath");
 		log.info("workingPath {}", workingPath);
 
+		final String backupPath = parser.get("backupPath");
+		log.info("backupPath {}", backupPath);
+
 		SparkConf sconf = new SparkConf();
 
 		Configuration conf = new Configuration();

@@ -68,12 +71,14 @@ public class ReadCOCI implements Serializable {
 					workingPath,
 					fileSystem,
 					outputPath,
+					backupPath,
 					delimiter);
 			});
 	}
 
 	private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
 		String outputPath,
+		String backupPath,
 		String delimiter) throws IOException {
 		RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
 			.listFiles(

@@ -107,7 +112,8 @@ public class ReadCOCI implements Serializable {
 				.mode(SaveMode.Append)
 				.option("compression", "gzip")
 				.json(outputPath);
-			fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
+
+			fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
 		}
 
 	}

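With the new backupPath argument, ReadCOCI no longer moves processed OpenCitations files to the hardcoded /tmp/miriam/OC/DONE directory; the destination now comes from the workflow configuration (see the --backupPath argument and the backupPath parameter added further down in this diff). A small sketch of the rename pattern, assuming a Hadoop FileSystem client on the classpath; all paths and the fs.defaultFS value are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

// Sketch: after a file has been processed, move it under a configurable
// backup directory instead of a hardcoded one.
public class MoveProcessedFiles {

	public static void main(final String[] args) throws Exception {
		final Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://nameservice1");

		final FileSystem fileSystem = FileSystem.get(conf);
		final Path inputDir = new Path("/data/opencitations/Extracted");
		final Path backupPath = new Path("/data/opencitations/backup");

		final RemoteIterator<LocatedFileStatus> files = fileSystem.listFiles(inputDir, false);
		while (files.hasNext()) {
			final LocatedFileStatus fileStatus = files.next();
			// ... process fileStatus.getPath() here ...
			fileSystem.rename(fileStatus.getPath(), backupPath);
		}
	}
}
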
@@ -69,9 +69,12 @@ public class CollectorWorker extends ReportingJob {
 		scheduleReport(counter);
 
 		try (SequenceFile.Writer writer = SequenceFile
-			.createWriter(this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
-				.keyClass(IntWritable.class), SequenceFile.Writer
-				.valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
+			.createWriter(
+				this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
+					.keyClass(IntWritable.class),
+				SequenceFile.Writer
+					.valueClass(Text.class),
+				SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
 			final IntWritable key = new IntWritable(counter.get());
 			final Text value = new Text();
 			plugin

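This CollectorWorker hunk only re-wraps the SequenceFile.createWriter call across several lines; the writer options (IntWritable keys, Text values, BLOCK compression with DeflateCodec) are unchanged. For reference, a self-contained sketch of the same writer configuration; the output path is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DeflateCodec;

// Sketch: write a few IntWritable/Text pairs to a block-compressed SequenceFile,
// using the same Writer options as the hunk above.
public class SequenceFileWriteSketch {

	public static void main(final String[] args) throws Exception {
		final Configuration conf = new Configuration();
		final Path outputPath = new Path("/tmp/records.seq");

		try (SequenceFile.Writer writer = SequenceFile
			.createWriter(
				conf, SequenceFile.Writer.file(outputPath),
				SequenceFile.Writer.keyClass(IntWritable.class),
				SequenceFile.Writer.valueClass(Text.class),
				SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {

			final IntWritable key = new IntWritable();
			final Text value = new Text();
			for (int i = 0; i < 3; i++) {
				key.set(i);
				value.set("record " + i);
				writer.append(key, value);
			}
		}
	}
}
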
@@ -36,7 +36,9 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
 			.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
 			.orElse(PAGE_SIZE_VALUE_DEFAULT);
 
-		if (StringUtils.isBlank(baseUrl)) { throw new CollectorException("Param 'baseUrl' is null or empty"); }
+		if (StringUtils.isBlank(baseUrl)) {
+			throw new CollectorException("Param 'baseUrl' is null or empty");
+		}
 
 		final OsfPreprintsIterator it = new OsfPreprintsIterator(baseUrl, pageSize, getClientParams());
 

@@ -5,6 +5,7 @@ import java.util.Iterator;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
 
+import org.apache.commons.lang3.StringUtils;
 import org.dom4j.Document;
 import org.dom4j.DocumentHelper;
 import org.dom4j.Element;

@@ -46,13 +47,15 @@ public class OsfPreprintsIterator implements Iterator<String> {
 
 	private void initQueue() {
 		this.currentUrl = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize;
+
 		log.info("REST calls starting with {}", this.currentUrl);
 	}
 
 	@Override
 	public boolean hasNext() {
 		synchronized (this.recordQueue) {
-			while (this.recordQueue.isEmpty() && !this.currentUrl.isEmpty()) {
+			while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
+				&& this.currentUrl.startsWith("http")) {
 				try {
 					this.currentUrl = downloadPage(this.currentUrl);
 				} catch (final CollectorException e) {

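hasNext now keeps paging only while the next-page URL is non-blank and starts with "http", instead of merely checking !currentUrl.isEmpty(); together with the StringUtils import above, this guards downloadPage against blank or non-URL values such as a null next link on the last page (the testXML case at the end of this diff converts exactly {'next':null}). A minimal sketch of the guarded pagination loop; fetchPage stands in for downloadPage and the URLs are illustrative:

import java.util.ArrayDeque;
import java.util.Queue;

import org.apache.commons.lang3.StringUtils;

// Sketch: only follow the pagination link while it is a non-blank http(s) URL.
public class PaginationGuardSketch {

	private final Queue<String> recordQueue = new ArrayDeque<>();
	private String currentUrl = "https://api.osf.io/v2/preprints/?page[size]=100";
	private int page = 0;

	// pretend download: fills the queue and returns the next URL ("" after the last page)
	private String fetchPage(final String url) {
		this.recordQueue.add("record from page " + this.page);
		return ++this.page < 3 ? url + "&page=" + this.page : "";
	}

	public boolean hasNext() {
		while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
			&& this.currentUrl.startsWith("http")) {
			this.currentUrl = fetchPage(this.currentUrl);
		}
		return !this.recordQueue.isEmpty();
	}

	public String next() {
		return this.recordQueue.poll();
	}

	public static void main(final String[] args) {
		final PaginationGuardSketch it = new PaginationGuardSketch();
		while (it.hasNext()) {
			System.out.println(it.next());
		}
	}
}
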
@@ -61,7 +64,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
 				}
 			}
 
-			if (!this.recordQueue.isEmpty()) { return true; }
+			if (!this.recordQueue.isEmpty()) {
+				return true;
+			}
 
 			return false;
 		}

@@ -83,18 +88,24 @@ public class OsfPreprintsIterator implements Iterator<String> {
 			final Element n = (Element) ((Element) o).detach();
 
 			final Element group = DocumentHelper.createElement("group");
-			group.addAttribute("id", n.valueOf(".//data/id"));
+			group.addAttribute("id", n.valueOf("./id"));
 
 			group.addElement("preprint").add(n);
 
 			for (final Object o1 : n.selectNodes(".//contributors//href")) {
-				final Document doc1 = downloadUrl(((Node) o1).getText(), 0);
-				group.addElement("contributors").add(doc1.getRootElement().detach());
+				final String href = ((Node) o1).getText();
+				if (StringUtils.isNotBlank(href) && href.startsWith("http")) {
+					final Document doc1 = downloadUrl(href, 0);
+					group.addElement("contributors").add(doc1.getRootElement().detach());
+				}
 			}
 			for (final Object o1 : n.selectNodes(".//primary_file//href")) {
-				final Document doc1 = downloadUrl(((Node) o1).getText(), 0);
-				group.addElement("primary_file").add(doc1.getRootElement().detach());
+				final String href = ((Node) o1).getText();
+				if (StringUtils.isNotBlank(href) && href.startsWith("http")) {
+					final Document doc1 = downloadUrl(href, 0);
+					group.addElement("primary_file").add(doc1.getRootElement().detach());
+				}
 			}
 
 			this.recordQueue.add(DocumentHelper.createDocument(group).asXML());
 		}

|
||||||
}
|
}
|
||||||
|
|
||||||
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
||||||
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
|
if (attempt > MAX_ATTEMPTS) {
|
||||||
|
throw new CollectorException("Max Number of attempts reached, url:" + url);
|
||||||
|
}
|
||||||
|
|
||||||
if (attempt > 0) {
|
if (attempt > 0) {
|
||||||
final int delay = (attempt * 5000);
|
final int delay = (attempt * 5000);
|
||||||
|
|
|
@@ -24,12 +24,19 @@
     "paramLongName": "outputPath",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
-  }, {
+  },
+  {
     "paramName": "nn",
     "paramLongName": "hdfsNameNode",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
-  }
+  },
+  {
+    "paramName": "bp",
+    "paramLongName": "backupPath",
+    "paramDescription": "the hdfs path to move the OC data after the extraction",
+    "paramRequired": true
+  }
 ]

@@ -129,6 +129,7 @@
 				</spark-opts>
 				<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
 				<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
+				<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
 				<arg>--delimiter</arg><arg>${delimiter}</arg>
 				<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
 			</spark>

@@ -504,6 +504,24 @@ case object Crossref2Oaf {
       )
     }
 
+    val is_review = json \ "relation" \ "is-review-of" \ "id"
+
+    if (is_review != JNothing) {
+      instance.setInstancetype(
+        OafMapperUtils.qualifier(
+          "0015",
+          "peerReviewed",
+          ModelConstants.DNET_REVIEW_LEVELS,
+          ModelConstants.DNET_REVIEW_LEVELS
+        )
+      )
+    }
+
+    if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
+      instance.setHostedby(
+        OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
+      )
+
     instance.setAccessright(
       decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
     )

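The new Crossref2Oaf block does two things: when the Crossref record carries a relation.is-review-of entry, the instance type is set to peer-reviewed (qualifier 0015), and when the DOI starts with 10.3410 or 10.12703 the instance is hosted by the H1Connect datasource. The production code is Scala with json4s; the Jackson sketch below only illustrates the two checks, on an invented sample record:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch of the checks added in Crossref2Oaf, expressed with Jackson instead
// of json4s: is there a relation.is-review-of entry, and does the DOI start
// with one of the prefixes routed to H1Connect?
public class CrossrefReviewCheckSketch {

	public static void main(final String[] args) throws Exception {
		final String json = "{\"DOI\":\"10.3410/f.12345\",\"relation\":{\"is-review-of\":[{\"id\":\"10.1000/xyz\"}]}}";
		final JsonNode root = new ObjectMapper().readTree(json);

		// rough equivalent of: json \ "relation" \ "is-review-of" \ "id" != JNothing
		final boolean isReview = !root.path("relation").path("is-review-of").isMissingNode();
		System.out.println("peerReviewed (0015): " + isReview);

		final String doi = root.path("DOI").asText();
		final boolean hostedByH1Connect = doi.startsWith("10.3410") || doi.startsWith("10.12703");
		System.out.println("hostedby H1Connect: " + hostedByH1Connect);
	}
}
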
@@ -18,6 +18,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;

@@ -49,7 +50,8 @@ public class OsfPreprintsCollectorPluginTest {
 	@Test
 	@Disabled
 	void test_one() throws CollectorException {
-		this.plugin.collect(this.api, new AggregatorReport())
+		this.plugin
+			.collect(this.api, new AggregatorReport())
 			.limit(1)
 			.forEach(log::info);
 	}

@@ -94,7 +96,8 @@ public class OsfPreprintsCollectorPluginTest {
 		final HttpConnector2 connector = new HttpConnector2();
 
 		try {
-			final String res = connector.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
+			final String res = connector
+				.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
 			System.out.println(res);
 			fail();
 		} catch (final Throwable e) {

@@ -110,4 +113,10 @@ public class OsfPreprintsCollectorPluginTest {
 
 	}
 
+	@Test
+	void testXML() {
+		final String xml = JsonUtils.convertToXML("{'next':null}");
+		System.out.println(xml);
+	}
+
 }