enrichment steps #38
|
@ -1,14 +1,14 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
|
@ -22,13 +22,15 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
|||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class OrcidDownloader extends OrcidDSManager {
|
||||
|
||||
static final int REQ_LIMIT = 24;
|
||||
static final int REQ_MAX_TEST = 100;
|
||||
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 50000;
|
||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss.";
|
||||
static final String lastUpdate = "2019-09-30 00:00:00.000000";
|
||||
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10;
|
||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
static final String lastUpdate = "2019-09-30 00:00:00";
|
||||
private String lambdaFileName;
|
||||
private String outputPath;
|
||||
private String token;
|
||||
|
@ -39,21 +41,22 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
orcidDownloader.parseLambdaFile();
|
||||
}
|
||||
|
||||
private String downloadRecord(String orcidId) throws Exception {
|
||||
private String downloadRecord(String orcidId) {
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
if (response.getStatusLine().getStatusCode() != 200) {
|
||||
Log.warn(
|
||||
Log
|
||||
.warn(
|
||||
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
||||
return new String("");
|
||||
}
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
|
||||
} catch (Throwable e) {
|
||||
Log.warn("Downloading " + orcidId, e);
|
||||
Log.warn("Downloading " + orcidId, e.getMessage());
|
||||
}
|
||||
return new String("");
|
||||
}
|
||||
|
@ -68,15 +71,14 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
|
||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
||||
Path hdfsoutputPath =
|
||||
new Path(
|
||||
Path hdfsoutputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(hdfsOrcidDefaultPath)
|
||||
.concat(outputPath)
|
||||
.concat("orcid_records.seq"));
|
||||
|
||||
try (SequenceFile.Writer writer =
|
||||
SequenceFile.createWriter(
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfsoutputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
|
@ -95,8 +97,9 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
}
|
||||
String[] values = line.split(",");
|
||||
List<String> recordInfo = Arrays.asList(values);
|
||||
if (isModified(recordInfo.get(3))) {
|
||||
String record = downloadRecord(recordInfo.get(0));
|
||||
String orcidId = recordInfo.get(0);
|
||||
if (isModified(orcidId, recordInfo.get(3))) {
|
||||
String record = downloadRecord(orcidId);
|
||||
downloadedRecordsCounter++;
|
||||
if (!record.isEmpty()) {
|
||||
String compressRecord = ArgumentApplicationParser.compressArgument(record);
|
||||
|
@ -107,8 +110,8 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
writer.append(key, value);
|
||||
savedRecordsCounter++;
|
||||
} catch (IOException e) {
|
||||
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||
Log.debug(e);
|
||||
Log.warn("Writing to sequence file: " + e.getMessage());
|
||||
Log.warn(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
@ -118,7 +121,8 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
if (nReqTmp == REQ_LIMIT) {
|
||||
long reqSessionDuration = endReq - startReqTmp;
|
||||
if (reqSessionDuration <= 1000) {
|
||||
Log.warn(
|
||||
Log
|
||||
.warn(
|
||||
"\nreqSessionDuration: "
|
||||
+ reqSessionDuration
|
||||
+ " nReqTmp: "
|
||||
|
@ -131,13 +135,18 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
}
|
||||
}
|
||||
|
||||
// if (parsedRecordsCounter>REQ_MAX_TEST) {
|
||||
// break;
|
||||
// }
|
||||
if (parsedRecordsCounter > REQ_MAX_TEST) {
|
||||
break;
|
||||
}
|
||||
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||
Log.info("Current record parsed: " + parsedRecordsCounter);
|
||||
Log.info("Current record downloaded: " + downloadedRecordsCounter);
|
||||
Log.info("Current record saved: " + savedRecordsCounter);
|
||||
Log
|
||||
.info(
|
||||
"Current parsed: "
|
||||
+ parsedRecordsCounter
|
||||
+ " downloaded: "
|
||||
+ downloadedRecordsCounter
|
||||
+ " saved: "
|
||||
+ savedRecordsCounter);
|
||||
}
|
||||
}
|
||||
long endDownload = System.currentTimeMillis();
|
||||
|
@ -153,10 +162,11 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser =
|
||||
new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
OrcidDownloader.class.getResourceAsStream(
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
OrcidDownloader.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
|
@ -171,9 +181,20 @@ public class OrcidDownloader extends OrcidDSManager {
|
|||
token = parser.get("token");
|
||||
}
|
||||
|
||||
private boolean isModified(String modifiedDate) throws ParseException {
|
||||
Date modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
||||
Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||
private boolean isModified(String orcidId, String modifiedDate) {
|
||||
Date modifiedDateDt = null;
|
||||
Date lastUpdateDt = null;
|
||||
try {
|
||||
if (modifiedDate.length() != 19) {
|
||||
modifiedDate = modifiedDate.substring(0, 19);
|
||||
}
|
||||
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
||||
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||
} catch (Exception e) {
|
||||
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
|
||||
return true;
|
||||
}
|
||||
|
||||
return modifiedDateDt.after(lastUpdateDt);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue