forked from D-Net/dnet-hadoop
fixed missing parameter on download update
This commit is contained in:
parent
859babf722
commit
e328bc0ade
|
@ -1,6 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.orcid;
|
||||
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import javax.swing.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
|
@ -10,12 +18,7 @@ import org.jetbrains.annotations.NotNull;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.swing.*;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
public class ORCIDWorker extends Thread {
|
||||
|
||||
|
@ -42,7 +45,8 @@ public class ORCIDWorker extends Thread {
|
|||
return new ORCIDWorkerBuilder();
|
||||
}
|
||||
|
||||
public ORCIDWorker(String id, BlockingQueue<String> myqueue, SequenceFile.Writer employments, SequenceFile.Writer summary, SequenceFile.Writer works, String token) {
|
||||
public ORCIDWorker(String id, BlockingQueue<String> myqueue, SequenceFile.Writer employments,
|
||||
SequenceFile.Writer summary, SequenceFile.Writer works, String token) {
|
||||
this.id = id;
|
||||
this.queue = myqueue;
|
||||
this.employments = employments;
|
||||
|
@ -51,7 +55,6 @@ public class ORCIDWorker extends Thread {
|
|||
this.token = token;
|
||||
}
|
||||
|
||||
|
||||
public static String retrieveURL(final String id, final String apiUrl, String token) {
|
||||
try {
|
||||
final HttpURLConnection urlConn = getHttpURLConnection(apiUrl, token);
|
||||
|
@ -59,7 +62,10 @@ public class ORCIDWorker extends Thread {
|
|||
InputStream input = urlConn.getInputStream();
|
||||
return IOUtils.toString(input);
|
||||
} else {
|
||||
log.error("Thread {} UNABLE TO DOWNLOAD FROM THIS URL {} , status code {}",id, apiUrl,urlConn.getResponseCode());
|
||||
log
|
||||
.error(
|
||||
"Thread {} UNABLE TO DOWNLOAD FROM THIS URL {} , status code {}", id, apiUrl,
|
||||
urlConn.getResponseCode());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Thread {} Error on retrieving URL {} {}", id, apiUrl, e);
|
||||
|
@ -86,12 +92,13 @@ public class ORCIDWorker extends Thread {
|
|||
private static String generateWorksURL(final String orcidId) {
|
||||
return "https://api.orcid.org/v3.0/" + orcidId + "/works";
|
||||
}
|
||||
|
||||
private static String generateEmploymentsURL(final String orcidId) {
|
||||
return "https://api.orcid.org/v3.0/" + orcidId + "/employments";
|
||||
}
|
||||
|
||||
|
||||
private static void writeResultToSequenceFile(String id, String url, String token, String orcidId, SequenceFile.Writer file) throws IOException {
|
||||
private static void writeResultToSequenceFile(String id, String url, String token, String orcidId,
|
||||
SequenceFile.Writer file) throws IOException {
|
||||
final String response = retrieveURL(id, url, token);
|
||||
if (response != null) {
|
||||
|
||||
|
@ -102,14 +109,12 @@ public class ORCIDWorker extends Thread {
|
|||
|
||||
if (file == null) {
|
||||
log.error("Thread {} file is null for {} URL:{}", id, url, orcidId);
|
||||
}
|
||||
else
|
||||
} else
|
||||
file.append(new Text(orcidId), new Text(response));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
final Text key = new Text();
|
||||
|
@ -135,7 +140,8 @@ public class ORCIDWorker extends Thread {
|
|||
total_time = System.currentTimeMillis() - start;
|
||||
requests++;
|
||||
if (total_time < 1000) {
|
||||
//I know making a sleep on a thread is bad, but we need to stay to 24 requests per seconds, hence
|
||||
// I know making a sleep on a thread is bad, but we need to stay to 24 requests per seconds,
|
||||
// hence
|
||||
// the time between two http request in a thread must be 1 second
|
||||
Thread.sleep(1000L - total_time);
|
||||
}
|
||||
|
@ -144,7 +150,8 @@ public class ORCIDWorker extends Thread {
|
|||
total_time = System.currentTimeMillis() - start;
|
||||
requests++;
|
||||
if (total_time < 1000) {
|
||||
//I know making a sleep on a thread is bad, but we need to stay to 24 requests per seconds, hence
|
||||
// I know making a sleep on a thread is bad, but we need to stay to 24 requests per seconds,
|
||||
// hence
|
||||
// the time between two http request in a thread must be 1 second
|
||||
Thread.sleep(1000L - total_time);
|
||||
}
|
||||
|
@ -153,17 +160,16 @@ public class ORCIDWorker extends Thread {
|
|||
total_time = System.currentTimeMillis() - start;
|
||||
requests++;
|
||||
if (total_time < 1000) {
|
||||
//I know making a sleep on a thread is bad, but we need to stay to 24 requests per seconds, hence
|
||||
// I know making a sleep on a thread is bad, but we need to stay to 24 requests per seconds,
|
||||
// hence
|
||||
// the time between two http request in a thread must be 1 second
|
||||
Thread.sleep(1000L - total_time);
|
||||
}
|
||||
if (requests %30 ==0)
|
||||
{
|
||||
if (requests % 30 == 0) {
|
||||
log.info("Thread {} Downloaded {}", id, requests);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} catch (Throwable e) {
|
||||
|
||||
log.error("Thread {} Unable to save ORICD: {} item error", id, orcidId, e);
|
||||
|
@ -179,13 +185,11 @@ public class ORCIDWorker extends Thread {
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
|
||||
log.info("Thread {} COMPLETE ", id);
|
||||
log.info("Thread {} Downloaded {}", id, requests);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static class ORCIDWorkerBuilder {
|
||||
|
||||
private String id;
|
||||
|
@ -206,7 +210,6 @@ public class ORCIDWorker extends Thread {
|
|||
return this;
|
||||
}
|
||||
|
||||
|
||||
public ORCIDWorkerBuilder withSummary(final SequenceFile.Writer sequenceFile) {
|
||||
this.summary = sequenceFile;
|
||||
return this;
|
||||
|
@ -227,16 +230,13 @@ public class ORCIDWorker extends Thread {
|
|||
return this;
|
||||
}
|
||||
|
||||
|
||||
public ORCIDWorker build() {
|
||||
if (this.summary== null || this.works==null || this.employments == null || StringUtils.isEmpty(token) || queue == null)
|
||||
if (this.summary == null || this.works == null || this.employments == null || StringUtils.isEmpty(token)
|
||||
|| queue == null)
|
||||
throw new RuntimeException("Unable to build missing required params");
|
||||
return new ORCIDWorker(id, queue, employments, summary, works, token);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
|
@ -23,17 +25,25 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
public class OrcidGetUpdatesFile {
|
||||
|
||||
private static Logger log = LoggerFactory.getLogger(OrcidGetUpdatesFile.class);
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
"/eu/dnetlib/dhp/collection/orcid/download_orcid_update_parameter.json");
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
OrcidGetUpdatesFile.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/orcid/download_orcid_update_parameter.json")))
|
||||
|
||||
);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String namenode = parser.get("namenode");
|
||||
log.info("got variable namenode: {}", namenode);
|
||||
|
@ -41,17 +51,17 @@ public class OrcidGetUpdatesFile {
|
|||
final String targetPath = parser.get("targetPath");
|
||||
log.info("got variable targetPath: {}", targetPath);
|
||||
|
||||
|
||||
final String apiURL = parser.get("apiURL");
|
||||
log.info("got variable apiURL: {}", apiURL);
|
||||
|
||||
final String accessToken = parser.get("accessToken");
|
||||
log.info("got variable accessToken: {}", accessToken);
|
||||
|
||||
|
||||
System.out.println("namenode = " + namenode);
|
||||
|
||||
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(namenode));
|
||||
|
||||
new OrcidGetUpdatesFile().readTar(fileSystem, accessToken, apiURL, targetPath, "2023-09-30");
|
||||
|
||||
}
|
||||
|
||||
|
@ -64,9 +74,10 @@ public class OrcidGetUpdatesFile {
|
|||
SequenceFile.Writer.valueClass(Text.class));
|
||||
}
|
||||
|
||||
|
||||
private ORCIDWorker createWorker(final String id, final String targetPath, final BlockingQueue<String> queue, final String accessToken, FileSystem fileSystem) throws Exception {
|
||||
return ORCIDWorker.builder()
|
||||
private ORCIDWorker createWorker(final String id, final String targetPath, final BlockingQueue<String> queue,
|
||||
final String accessToken, FileSystem fileSystem) throws Exception {
|
||||
return ORCIDWorker
|
||||
.builder()
|
||||
.withId(id)
|
||||
.withEmployments(createFile(new Path(String.format("%s/employments_%s", targetPath, id)), fileSystem))
|
||||
.withSummary(createFile(new Path(String.format("%s/summary_%s", targetPath, id)), fileSystem))
|
||||
|
@ -76,9 +87,8 @@ public class OrcidGetUpdatesFile {
|
|||
.build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void readTar(FileSystem fileSystem, final String accessToken, final String apiURL, final String targetPath, final String startDate ) throws Exception {
|
||||
public void readTar(FileSystem fileSystem, final String accessToken, final String apiURL, final String targetPath,
|
||||
final String startDate) throws Exception {
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(apiURL).openConnection();
|
||||
final HttpClientParams clientParams = new HttpClientParams();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
|
@ -93,19 +103,23 @@ public class OrcidGetUpdatesFile {
|
|||
|
||||
BlockingQueue<String> queue = new ArrayBlockingQueue<String>(3000);
|
||||
final List<ORCIDWorker> workers = new ArrayList<>();
|
||||
for (int i = 0; i <20; i++) {
|
||||
for (int i = 0; i < 22; i++) {
|
||||
workers.add(createWorker("" + i, targetPath, queue, accessToken, fileSystem));
|
||||
}
|
||||
workers.forEach(Thread::start);
|
||||
|
||||
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
|
||||
if (entry.isFile()) {
|
||||
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
|
||||
System.out.println(br.readLine());
|
||||
br.lines().map(l -> l.split(",")).filter(s -> StringUtils.compare(s[3].substring(0, 10), startDate) > 0).map(s->s[0]).limit(200).forEach(s -> {
|
||||
br
|
||||
.lines()
|
||||
.map(l -> l.split(","))
|
||||
.filter(s -> StringUtils.compare(s[3].substring(0, 10), startDate) > 0)
|
||||
.map(s -> s[0])
|
||||
.forEach(s -> {
|
||||
try {
|
||||
log.info("Adding item ");
|
||||
queue.put(s);
|
||||
|
@ -122,12 +136,7 @@ public class OrcidGetUpdatesFile {
|
|||
worker.join();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
[
|
||||
{
|
||||
[ {
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
|
@ -23,5 +22,4 @@
|
|||
"paramDescription": "the accessToken to contact API",
|
||||
"paramRequired": true
|
||||
}
|
||||
|
||||
]
|
|
@ -6,12 +6,12 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>apiURL</name>
|
||||
<description>The URL of the update CSV list </description>
|
||||
<value>http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar</value>
|
||||
<description>The URL of the update CSV list </description>
|
||||
</property>
|
||||
<property>
|
||||
<name>accessToken</name>
|
||||
<description>The access tocken</description>
|
||||
<description>The access token</description>
|
||||
|
||||
</property>
|
||||
</parameters>
|
||||
|
|
|
@ -124,16 +124,9 @@ public class DownloadORCIDTest {
|
|||
|
||||
// @Test
|
||||
// public void testReadTar() throws Exception {
|
||||
//// new OrcidGetUpdatesFile().readTar();
|
||||
//
|
||||
// Configuration conf = new Configuration();
|
||||
// FileSystem fs = FileSystem.get(URI.create("file:///"), conf);
|
||||
// final String token ="78fdb232-7105-4086-8570-e153f4198e3d";
|
||||
//
|
||||
// new OrcidGetUpdatesFile().readTar(fs,token, "http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar", "file:///Users/sandro/orcid","2023-09-30");
|
||||
//
|
||||
//
|
||||
//
|
||||
// OrcidGetUpdatesFile.main(new String[] {
|
||||
// "--namenode", "puppa"
|
||||
// });
|
||||
//
|
||||
// }
|
||||
|
||||
|
|
Loading…
Reference in New Issue