[ORCID-no-doi] integrating PR#98 D-Net/dnet-hadoop#98

This commit is contained in:
Claudio Atzori 2021-04-01 17:07:49 +02:00
parent 70e49ed53c
commit ee34cc51c3
45 changed files with 3844 additions and 675 deletions

View File

@ -0,0 +1,31 @@
diff a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java (rejected hunks)
@@ -1,8 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-
import static com.google.common.base.Preconditions.checkArgument;
import java.text.ParseException;
@@ -10,6 +8,8 @@ import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+
/**
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
@@ -137,7 +137,10 @@ public class Relation extends Oaf {
try {
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
} catch (ParseException e) {
- throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
+ throw new IllegalArgumentException(String
+ .format(
+ "invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
+ getValidationDate()));
}
super.mergeFrom(r);

View File

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/**
 * Serializable bean of the ORCID schema package that groups the "history" attributes of an author:
 * how and when the record was created, completed, submitted, last modified and deactivated, plus the
 * claimed / verified-email flags.
 *
 * <p>All date-like values are stored as plain strings; this class neither parses nor validates them.
 * String properties default to {@code null} and boolean properties to {@code false}.
 */
public class AuthorHistory implements Serializable {

    private String creationMethod;
    private String completionDate;
    private String submissionDate;
    private String lastModifiedDate;
    private boolean claimed;
    private String deactivationDate;
    private boolean verifiedEmail;
    private boolean verifiedPrimaryEmail;

    /** @return the creation method, or {@code null} when it was never set */
    public String getCreationMethod() {
        return this.creationMethod;
    }

    /** @param creationMethod the creation method to store */
    public void setCreationMethod(String creationMethod) {
        this.creationMethod = creationMethod;
    }

    /** @return the completion date string, or {@code null} when it was never set */
    public String getCompletionDate() {
        return this.completionDate;
    }

    /** @param completionDate the completion date string to store (kept as-is) */
    public void setCompletionDate(String completionDate) {
        this.completionDate = completionDate;
    }

    /** @return the submission date string, or {@code null} when it was never set */
    public String getSubmissionDate() {
        return this.submissionDate;
    }

    /** @param submissionDate the submission date string to store (kept as-is) */
    public void setSubmissionDate(String submissionDate) {
        this.submissionDate = submissionDate;
    }

    /** @return the last-modified date string, or {@code null} when it was never set */
    public String getLastModifiedDate() {
        return this.lastModifiedDate;
    }

    /** @param lastModifiedDate the last-modified date string to store (kept as-is) */
    public void setLastModifiedDate(String lastModifiedDate) {
        this.lastModifiedDate = lastModifiedDate;
    }

    /** @return the value of the {@code claimed} flag ({@code false} by default) */
    public boolean isClaimed() {
        return this.claimed;
    }

    /** @param claimed the new value of the {@code claimed} flag */
    public void setClaimed(boolean claimed) {
        this.claimed = claimed;
    }

    /** @return the deactivation date string, or {@code null} when it was never set */
    public String getDeactivationDate() {
        return this.deactivationDate;
    }

    /** @param deactivationDate the deactivation date string to store (kept as-is) */
    public void setDeactivationDate(String deactivationDate) {
        this.deactivationDate = deactivationDate;
    }

    /** @return the value of the {@code verifiedEmail} flag ({@code false} by default) */
    public boolean isVerifiedEmail() {
        return this.verifiedEmail;
    }

    /** @param verifiedEmail the new value of the {@code verifiedEmail} flag */
    public void setVerifiedEmail(boolean verifiedEmail) {
        this.verifiedEmail = verifiedEmail;
    }

    /** @return the value of the {@code verifiedPrimaryEmail} flag ({@code false} by default) */
    public boolean isVerifiedPrimaryEmail() {
        return this.verifiedPrimaryEmail;
    }

    /** @param verifiedPrimaryEmail the new value of the {@code verifiedPrimaryEmail} flag */
    public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) {
        this.verifiedPrimaryEmail = verifiedPrimaryEmail;
    }
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/**
 * ORCID author summary: an {@link OrcidData} record enriched with the author's personal data and the
 * history of the author's profile. Both parts are optional and default to {@code null}.
 */
public class AuthorSummary extends OrcidData implements Serializable {

    private AuthorData authorData;
    private AuthorHistory authorHistory;

    /** @return the author data attached to this summary, or {@code null} when none was set */
    public AuthorData getAuthorData() {
        return this.authorData;
    }

    /** @param authorData the author data to attach to this summary */
    public void setAuthorData(AuthorData authorData) {
        this.authorData = authorData;
    }

    /** @return the author history attached to this summary, or {@code null} when none was set */
    public AuthorHistory getAuthorHistory() {
        return this.authorHistory;
    }

    /** @param authorHistory the author history to attach to this summary */
    public void setAuthorHistory(AuthorHistory authorHistory) {
        this.authorHistory = authorHistory;
    }
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.doiboost.orcidnodoi.model; package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable; import java.io.Serializable;
@ -12,9 +12,9 @@ import eu.dnetlib.dhp.schema.orcid.AuthorData;
public class Contributor extends AuthorData implements Serializable { public class Contributor extends AuthorData implements Serializable {
private String sequence; private String sequence;
private String role; private String role;
private transient boolean simpleMatch = false; private transient boolean simpleMatch;
private transient Double score = 0.0; private transient Double score;
private transient boolean bestMatch = false; private transient boolean bestMatch;
public String getSequence() { public String getSequence() {
return sequence; return sequence;

View File

@ -1,11 +1,13 @@
package eu.dnetlib.doiboost.orcidnodoi.model; package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/** /**
* This class models the data related to external id, that are retrieved from an orcid publication * This class models the data related to external id, that are retrieved from an orcid publication
*/ */
public class ExternalId { public class ExternalId implements Serializable {
private String type; private String type;
private String value; private String value;
private String relationShip; private String relationShip;

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/**
 * Base serializable bean for ORCID payloads: carries a base64/compressed data string together with
 * the status code and the download date associated with it. All three values are plain strings that
 * default to {@code null}; the fields are {@code protected} so that subclasses can access them directly.
 */
public class OrcidData implements Serializable {

    protected String base64CompressData;
    protected String statusCode;
    protected String downloadDate;

    /** @return the stored base64 compressed data string, or {@code null} when it was never set */
    public String getBase64CompressData() {
        return this.base64CompressData;
    }

    /** @param base64CompressData the base64 compressed data string to store (kept as-is) */
    public void setBase64CompressData(String base64CompressData) {
        this.base64CompressData = base64CompressData;
    }

    /** @return the stored status code string, or {@code null} when it was never set */
    public String getStatusCode() {
        return this.statusCode;
    }

    /** @param statusCode the status code string to store (kept as-is) */
    public void setStatusCode(String statusCode) {
        this.statusCode = statusCode;
    }

    /** @return the stored download date string, or {@code null} when it was never set */
    public String getDownloadDate() {
        return this.downloadDate;
    }

    /** @param downloadDate the download date string to store (kept as-is) */
    public void setDownloadDate(String downloadDate) {
        this.downloadDate = downloadDate;
    }
}

View File

@ -1,11 +1,13 @@
package eu.dnetlib.doiboost.orcidnodoi.model; package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/** /**
* This class models the data related to a publication date, that are retrieved from an orcid publication * This class models the data related to a publication date, that are retrieved from an orcid publication
*/ */
public class PublicationDate { public class PublicationDate implements Serializable {
private String year; private String year;
private String month; private String month;
private String day; private String day;

View File

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/**
 * Serializable bean of the ORCID schema package holding summary attributes of a record: creation
 * method, completion / submission / last-modified / deactivation dates and the claimed and
 * verified-email flags.
 *
 * <p>Dates are stored as plain strings without parsing or validation. String properties default to
 * {@code null} and boolean properties to {@code false}.
 */
public class Summary implements Serializable {

    private String creationMethod;
    private String completionDate;
    private String submissionDate;
    private String lastModifiedDate;
    private boolean claimed;
    private String deactivationDate;
    private boolean verifiedEmail;
    private boolean verifiedPrimaryEmail;

    /** @return the creation method, or {@code null} when it was never set */
    public String getCreationMethod() {
        return this.creationMethod;
    }

    /** @param creationMethod the creation method to store */
    public void setCreationMethod(String creationMethod) {
        this.creationMethod = creationMethod;
    }

    /** @return the completion date string, or {@code null} when it was never set */
    public String getCompletionDate() {
        return this.completionDate;
    }

    /** @param completionDate the completion date string to store (kept as-is) */
    public void setCompletionDate(String completionDate) {
        this.completionDate = completionDate;
    }

    /** @return the submission date string, or {@code null} when it was never set */
    public String getSubmissionDate() {
        return this.submissionDate;
    }

    /** @param submissionDate the submission date string to store (kept as-is) */
    public void setSubmissionDate(String submissionDate) {
        this.submissionDate = submissionDate;
    }

    /** @return the last-modified date string, or {@code null} when it was never set */
    public String getLastModifiedDate() {
        return this.lastModifiedDate;
    }

    /** @param lastModifiedDate the last-modified date string to store (kept as-is) */
    public void setLastModifiedDate(String lastModifiedDate) {
        this.lastModifiedDate = lastModifiedDate;
    }

    /** @return the value of the {@code claimed} flag ({@code false} by default) */
    public boolean isClaimed() {
        return this.claimed;
    }

    /** @param claimed the new value of the {@code claimed} flag */
    public void setClaimed(boolean claimed) {
        this.claimed = claimed;
    }

    /** @return the deactivation date string, or {@code null} when it was never set */
    public String getDeactivationDate() {
        return this.deactivationDate;
    }

    /** @param deactivationDate the deactivation date string to store (kept as-is) */
    public void setDeactivationDate(String deactivationDate) {
        this.deactivationDate = deactivationDate;
    }

    /** @return the value of the {@code verifiedEmail} flag ({@code false} by default) */
    public boolean isVerifiedEmail() {
        return this.verifiedEmail;
    }

    /** @param verifiedEmail the new value of the {@code verifiedEmail} flag */
    public void setVerifiedEmail(boolean verifiedEmail) {
        this.verifiedEmail = verifiedEmail;
    }

    /** @return the value of the {@code verifiedPrimaryEmail} flag ({@code false} by default) */
    public boolean isVerifiedPrimaryEmail() {
        return this.verifiedPrimaryEmail;
    }

    /** @param verifiedPrimaryEmail the new value of the {@code verifiedPrimaryEmail} flag */
    public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) {
        this.verifiedPrimaryEmail = verifiedPrimaryEmail;
    }
}

View File

@ -0,0 +1,16 @@
package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
/**
 * ORCID work: an {@link OrcidData} record that additionally carries the parsed detail of the work.
 * The detail is optional and defaults to {@code null}.
 */
public class Work extends OrcidData implements Serializable {

    // NOTE(review): package-private, unlike the private fields of the sibling beans in this package;
    // left as-is because same-package code may read it directly -- confirm before narrowing.
    WorkDetail workDetail;

    /** @return the detail attached to this work, or {@code null} when none was set */
    public WorkDetail getWorkDetail() {
        return this.workDetail;
    }

    /** @param workDetail the detail to attach to this work */
    public void setWorkDetail(WorkDetail workDetail) {
        this.workDetail = workDetail;
    }
}

View File

@ -1,14 +1,19 @@
package eu.dnetlib.doiboost.orcidnodoi.model; package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.schema.orcid.Contributor;
import eu.dnetlib.dhp.schema.orcid.ExternalId;
import eu.dnetlib.dhp.schema.orcid.OrcidData;
import eu.dnetlib.dhp.schema.orcid.PublicationDate;
/** /**
* This class models the data that are retrieved from orcid publication * This class models the data that are retrieved from orcid publication
*/ */
public class WorkDataNoDoi implements Serializable { public class WorkDetail implements Serializable {
private String oid; private String oid;
private String id; private String id;

View File

@ -1,208 +0,0 @@
package eu.dnetlib.doiboost.orcid;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
 * Command line tool that reads the gzipped tar "lambda file" from HDFS, and for every listed ORCID id
 * whose modification date is after {@link #lastUpdate} downloads the full record from the ORCID API
 * (v3.0) and appends it, as XML text, to a block-compressed {@link SequenceFile} named
 * {@code updated_xml_authors.seq}.
 *
 * <p>{@code hdfsServerUri} and {@code workingPath}, as well as {@code initConfigurationObject()} and
 * {@code initFileSystemObject(...)}, are not declared in this class; presumably they are inherited
 * from {@link OrcidDSManager}.
 */
public class OrcidDownloader extends OrcidDSManager {

    /** Number of download requests after which the elapsed time is checked for throttling. */
    static final int REQ_LIMIT = 24;
    /** When different from -1, upper bound on parsed records (checked at each progress log). */
    static final int REQ_MAX_TEST = -1;
    /** A progress line is logged every time this many records have been parsed. */
    static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
    static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
    /** Records modified after this instant (formatted as {@link #DATE_FORMAT}) are downloaded. */
    static final String lastUpdate = "2020-09-29 00:00:00";

    private String lambdaFileName;
    private String outputPath;
    private String token;

    /**
     * Entry point: parses the command line arguments and runs the download.
     *
     * @param args command line arguments, described by {@code download_orcid_data.json}
     * @throws Exception on any argument parsing, I/O or download failure
     */
    public static void main(String[] args) throws IOException, Exception {
        OrcidDownloader orcidDownloader = new OrcidDownloader();
        orcidDownloader.loadArgs(args);
        orcidDownloader.parseLambdaFile();
    }

    /**
     * Downloads the record of one ORCID id.
     *
     * @param orcidId the ORCID identifier to download
     * @return the XML of the record with line terminators removed, or the empty string when the API
     *         answered with a status code other than 200
     * @throws IOException when the HTTP exchange fails
     */
    private String downloadRecord(String orcidId) throws IOException {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
            httpGet.addHeader("Accept", "application/vnd.orcid+xml");
            httpGet.addHeader("Authorization", String.format("Bearer %s", token));
            // The response holds the underlying connection: close it explicitly on every path.
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                int statusCode = response.getStatusLine().getStatusCode();
                if (statusCode != 200) {
                    Log.info("Downloading " + orcidId + " status code: " + statusCode);
                    return "";
                }
                return xmlStreamToString(response.getEntity().getContent());
            }
        }
    }

    /**
     * Reads the whole stream as text, concatenating its lines without line terminators.
     *
     * @param xmlStream the stream to consume; it is closed by this method
     * @return the concatenated lines
     * @throws IOException when reading fails
     */
    private String xmlStreamToString(InputStream xmlStream) throws IOException {
        // Decode as UTF-8 explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the API body is UTF-8 encoded -- confirm against the response headers.
        try (BufferedReader br = new BufferedReader(
            new InputStreamReader(xmlStream, java.nio.charset.StandardCharsets.UTF_8))) {
            StringBuilder buffer = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                buffer.append(line);
            }
            return buffer.toString();
        }
    }

    /**
     * Scans the lambda file and downloads every record modified after {@link #lastUpdate}.
     *
     * <p>The first line read is treated as the CSV header and skipped. Within a tar entry the scan
     * stops at the first record that is not modified. At most {@link #REQ_LIMIT} requests are issued
     * per second: once that many downloads have been performed in less than a second, the thread
     * sleeps for the remainder of the second.
     *
     * @throws Exception on HDFS, HTTP or interruption failures
     */
    public void parseLambdaFile() throws Exception {
        int parsedRecordsCounter = 0;
        int downloadedRecordsCounter = 0;
        int savedRecordsCounter = 0;
        long startDownload = 0;
        // Throttling state must outlive a single line, otherwise the limit can never be reached.
        int nReqTmp = 0;
        long startReqTmp;
        Configuration conf = initConfigurationObject();
        FileSystem fs = initFileSystemObject(conf);
        String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
        Path hdfsreadpath = new Path(lambdaFileUri);
        Path hdfsoutputPath = new Path(
            hdfsServerUri
                .concat(workingPath)
                .concat(outputPath)
                .concat("updated_xml_authors.seq"));
        // The raw HDFS stream is a resource on its own so that it is released even when wrapping it
        // into the gzip/tar streams fails.
        try (FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
            TarArchiveInputStream tais = new TarArchiveInputStream(
                new GzipCompressorInputStream(lambdaFileStream));
            SequenceFile.Writer writer = SequenceFile
                .createWriter(
                    conf,
                    SequenceFile.Writer.file(hdfsoutputPath),
                    SequenceFile.Writer.keyClass(Text.class),
                    SequenceFile.Writer.valueClass(Text.class),
                    SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
            startDownload = System.currentTimeMillis();
            startReqTmp = startDownload;
            while (tais.getNextTarEntry() != null) {
                // Read directly from the tar stream; the reader is deliberately NOT closed because
                // closing it would close the archive and prevent reading the following entries.
                BufferedReader br = new BufferedReader(
                    new InputStreamReader(tais, java.nio.charset.StandardCharsets.UTF_8));
                String line;
                while ((line = br.readLine()) != null) {
                    // skip headers line
                    if (parsedRecordsCounter == 0) {
                        parsedRecordsCounter++;
                        continue;
                    }
                    parsedRecordsCounter++;
                    List<String> recordInfo = Arrays.asList(line.split(","));
                    String orcidId = recordInfo.get(0);
                    if (!isModified(orcidId, recordInfo.get(3))) {
                        break;
                    }
                    String record = downloadRecord(orcidId);
                    downloadedRecordsCounter++;
                    if (!record.isEmpty()) {
                        writer.append(new Text(orcidId), new Text(record));
                        savedRecordsCounter++;
                    }
                    nReqTmp++;
                    if (nReqTmp == REQ_LIMIT) {
                        long reqSessionDuration = System.currentTimeMillis() - startReqTmp;
                        if (reqSessionDuration <= 1000) {
                            Log
                                .info(
                                    "\nreqSessionDuration: "
                                        + reqSessionDuration
                                        + " nReqTmp: "
                                        + nReqTmp
                                        + " wait ....");
                            Thread.sleep(1000 - reqSessionDuration);
                        }
                        // Open a new throttling window whether or not a pause was needed.
                        nReqTmp = 0;
                        startReqTmp = System.currentTimeMillis();
                    }
                    if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
                        Log
                            .info(
                                "Current parsed: "
                                    + parsedRecordsCounter
                                    + " downloaded: "
                                    + downloadedRecordsCounter
                                    + " saved: "
                                    + savedRecordsCounter);
                        if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
                            break;
                        }
                    }
                }
                long endDownload = System.currentTimeMillis();
                long downloadTime = endDownload - startDownload;
                Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
            }
        }
        Log.info("Download started at: " + new Date(startDownload).toString());
        Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
        Log.info("Parsed Records Counter: " + parsedRecordsCounter);
        Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
        Log.info("Saved Records Counter: " + savedRecordsCounter);
    }

    /**
     * Parses the command line and stores the resulting settings; the API token is never logged.
     *
     * @param args command line arguments, described by {@code download_orcid_data.json}
     * @throws Exception when the argument description cannot be read or the arguments are invalid
     */
    private void loadArgs(String[] args) throws IOException, Exception {
        final ArgumentApplicationParser parser;
        // Close the classpath resource once its content has been read.
        try (InputStream argsDescription = OrcidDownloader.class
            .getResourceAsStream("/eu/dnetlib/dhp/doiboost/download_orcid_data.json")) {
            parser = new ArgumentApplicationParser(IOUtils.toString(argsDescription));
        }
        parser.parseArgument(args);

        hdfsServerUri = parser.get("hdfsServerUri");
        Log.info("HDFS URI: " + hdfsServerUri);
        workingPath = parser.get("workingPath");
        Log.info("Default Path: " + workingPath);
        lambdaFileName = parser.get("lambdaFileName");
        Log.info("Lambda File Name: " + lambdaFileName);
        outputPath = parser.get("outputPath");
        Log.info("Output Data: " + outputPath);
        token = parser.get("token");
    }

    /**
     * Tells whether a record was modified after {@link #lastUpdate}.
     *
     * @param orcidId the ORCID identifier, used only in the log message
     * @param modifiedDate the modification date; only its first 19 characters
     *        ({@code yyyy-MM-dd HH:mm:ss}) are considered
     * @return {@code true} when the date is after {@link #lastUpdate}, and also when the date cannot
     *         be parsed (best effort: an unreadable date leads to a download rather than to a skip)
     */
    public boolean isModified(String orcidId, String modifiedDate) {
        Date modifiedDateDt = null;
        Date lastUpdateDt = null;
        try {
            if (modifiedDate.length() != 19) {
                modifiedDate = modifiedDate.substring(0, 19);
            }
            modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
            lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
        } catch (Exception e) {
            // Concatenate the cause: the message had no "{}" placeholder, so it was being dropped.
            Log.info("[" + orcidId + "] Parsing date: " + e.getMessage());
            return true;
        }
        return modifiedDateDt.after(lastUpdateDt);
    }
}

View File

@ -8,6 +8,7 @@ import java.util.Date;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
@ -24,13 +25,13 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData; import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import scala.Tuple2; import scala.Tuple2;
public class SparkDownloadOrcidAuthors { public class SparkDownloadOrcidAuthors {
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class); static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static final String lastUpdate = "2020-09-29 00:00:00";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -53,18 +54,25 @@ public class SparkDownloadOrcidAuthors {
final String token = parser.get("token"); final String token = parser.get("token");
final String lambdaFileName = parser.get("lambdaFileName"); final String lambdaFileName = parser.get("lambdaFileName");
logger.info("lambdaFileName: {}", lambdaFileName); logger.info("lambdaFileName: {}", lambdaFileName);
final String hdfsServerUri = parser.get("hdfsServerUri");
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
logger.info("lastUpdate: {}", lastUpdate);
if (StringUtils.isBlank(lastUpdate)) {
throw new RuntimeException("last update info not found");
}
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records"); LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records"); LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records"); LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403"); LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409"); LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503"); LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525"); LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
@ -73,13 +81,14 @@ public class SparkDownloadOrcidAuthors {
logger.info("Retrieving data from lamda sequence file"); logger.info("Retrieving data from lamda sequence file");
JavaPairRDD<Text, Text> lamdaFileRDD = sc JavaPairRDD<Text, Text> lamdaFileRDD = sc
.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class); .sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
logger.info("Data retrieved: " + lamdaFileRDD.count()); final long lamdaFileRDDCount = lamdaFileRDD.count();
logger.info("Data retrieved: " + lamdaFileRDDCount);
Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> { Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
String orcidId = data._1().toString(); String orcidId = data._1().toString();
String lastModifiedDate = data._2().toString(); String lastModifiedDate = data._2().toString();
parsedRecordsAcc.add(1); parsedRecordsAcc.add(1);
if (isModified(orcidId, lastModifiedDate)) { if (isModified(orcidId, lastModifiedDate, lastUpdate)) {
modifiedRecordsAcc.add(1); modifiedRecordsAcc.add(1);
return true; return true;
} }
@ -92,49 +101,42 @@ public class SparkDownloadOrcidAuthors {
final DownloadedRecordData downloaded = new DownloadedRecordData(); final DownloadedRecordData downloaded = new DownloadedRecordData();
downloaded.setOrcidId(orcidId); downloaded.setOrcidId(orcidId);
downloaded.setLastModifiedDate(lastModifiedDate); downloaded.setLastModifiedDate(lastModifiedDate);
try (CloseableHttpClient client = HttpClients.createDefault()) { CloseableHttpClient client = HttpClients.createDefault();
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml"); httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", String.format("Bearer %s", token)); httpGet.addHeader("Authorization", String.format("Bearer %s", token));
long startReq = System.currentTimeMillis(); long startReq = System.currentTimeMillis();
CloseableHttpResponse response = client.execute(httpGet); CloseableHttpResponse response = client.execute(httpGet);
long endReq = System.currentTimeMillis(); long endReq = System.currentTimeMillis();
long reqTime = endReq - startReq; long reqTime = endReq - startReq;
if (reqTime < 1000) { if (reqTime < 1000) {
Thread.sleep(1000 - reqTime); Thread.sleep(1000 - reqTime);
}
int statusCode = response.getStatusLine().getStatusCode();
downloaded.setStatusCode(statusCode);
if (statusCode != 200) {
switch (statusCode) {
case 403:
errorHTTP403Acc.add(1);
case 404:
errorHTTP404Acc.add(1);
case 409:
errorHTTP409Acc.add(1);
case 503:
errorHTTP503Acc.add(1);
case 525:
errorHTTP525Acc.add(1);
default:
errorHTTPGenericAcc.add(1);
} }
int statusCode = response.getStatusLine().getStatusCode();
downloaded.setStatusCode(statusCode);
if (statusCode != 200) {
switch (statusCode) {
case 403:
errorHTTP403Acc.add(1);
case 409:
errorHTTP409Acc.add(1);
case 503:
errorHTTP503Acc.add(1);
throw new RuntimeException("Orcid request rate limit reached (HTTP 503)");
case 525:
errorHTTP525Acc.add(1);
default:
errorHTTPGenericAcc.add(1);
logger
.info(
"Downloading " + orcidId + " status code: "
+ response.getStatusLine().getStatusCode());
}
return downloaded.toTuple2();
}
downloadedRecordsAcc.add(1);
downloaded
.setCompressedData(
ArgumentApplicationParser
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
} catch (Throwable e) {
logger.info("Downloading " + orcidId, e.getMessage());
downloaded.setErrorMessage(e.getMessage());
return downloaded.toTuple2(); return downloaded.toTuple2();
} }
downloadedRecordsAcc.add(1);
downloaded
.setCompressedData(
ArgumentApplicationParser
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
client.close();
return downloaded.toTuple2(); return downloaded.toTuple2();
}; };
@ -142,10 +144,12 @@ public class SparkDownloadOrcidAuthors {
logger.info("Start execution ..."); logger.info("Start execution ...");
JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter); JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
logger.info("Authors modified count: " + authorsModifiedRDD.count()); long authorsModifiedCount = authorsModifiedRDD.count();
logger.info("Authors modified count: " + authorsModifiedCount);
logger.info("Start downloading ..."); logger.info("Start downloading ...");
authorsModifiedRDD authorsModifiedRDD
.repartition(10) .repartition(100)
.map(downloadRecordFunction) .map(downloadRecordFunction)
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile( .saveAsNewAPIHadoopFile(
@ -154,10 +158,12 @@ public class SparkDownloadOrcidAuthors {
Text.class, Text.class,
SequenceFileOutputFormat.class, SequenceFileOutputFormat.class,
sc.hadoopConfiguration()); sc.hadoopConfiguration());
logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString()); logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString()); logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString()); logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString()); logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
logger.info("errorHTTP404Acc: " + errorHTTP404Acc.value().toString());
logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString()); logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString()); logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString()); logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
@ -166,18 +172,27 @@ public class SparkDownloadOrcidAuthors {
} }
private static boolean isModified(String orcidId, String modifiedDate) { public static boolean isModified(String orcidId, String modifiedDate, String lastUpdate) {
Date modifiedDateDt; Date modifiedDateDt;
Date lastUpdateDt; Date lastUpdateDt;
String lastUpdateRedux = "";
try { try {
if (modifiedDate.equals("last_modified")) {
return false;
}
if (modifiedDate.length() != 19) { if (modifiedDate.length() != 19) {
modifiedDate = modifiedDate.substring(0, 19); modifiedDate = modifiedDate.substring(0, 19);
} }
if (lastUpdate.length() != 19) {
lastUpdateRedux = lastUpdate.substring(0, 19);
} else {
lastUpdateRedux = lastUpdate;
}
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate); modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate); lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdateRedux);
} catch (Exception e) { } catch (Exception e) {
logger.info("[" + orcidId + "] Parsing date: ", e.getMessage()); throw new RuntimeException("[" + orcidId + "] modifiedDate <" + modifiedDate + "> lastUpdate <" + lastUpdate
return true; + "> Parsing date: " + e.getMessage());
} }
return modifiedDateDt.after(lastUpdateDt); return modifiedDateDt.after(lastUpdateDt);
} }

View File

@ -0,0 +1,30 @@
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java (rejected hunks)
@@ -31,7 +32,6 @@ public class SparkDownloadOrcidAuthors {
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
- static String lastUpdate;
public static void main(String[] args) throws Exception {
@@ -54,14 +54,18 @@ public class SparkDownloadOrcidAuthors {
final String token = parser.get("token");
final String lambdaFileName = parser.get("lambdaFileName");
logger.info("lambdaFileName: {}", lambdaFileName);
-
- lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
+ final String hdfsServerUri = parser.get("hdfsServerUri");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
+ String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
+ logger.info("lastUpdate: ", lastUpdate);
+ if (StringUtils.isBlank(lastUpdate)) {
+ throw new RuntimeException("last update info not found");
+ }
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");

View File

@ -0,0 +1,251 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
import scala.Tuple2;
/**
 * Spark job that downloads from the ORCID API the works that were modified after the last update.
 *
 * <p>The job reads the sequence files under {@code workingPath + "downloads/updated_authors/*"}, whose
 * values are JSON documents carrying a {@code statusCode} and a {@code compressedData} payload (the
 * compressed author summary XML). For every author it extracts the work identifiers together with their
 * last-modified date, keeps the ones that {@link #isModified(String, String, String)} reports as newer
 * than the value stored in {@code last_update.txt}, downloads each of them from the ORCID API and stores
 * the result as gzip-compressed text under {@code workingPath.concat(outputPath)}.
 */
public class SparkDownloadOrcidWorks {

    static final Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidWorks.class);

    public static final String LAMBDA_FILE_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
    public static final DateTimeFormatter LAMBDA_FILE_DATE_FORMATTER = DateTimeFormatter
        .ofPattern(LAMBDA_FILE_DATE_FORMAT);
    public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
    public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
        .ofPattern(ORCID_XML_DATETIME_FORMAT);

    /** Base URL that the relative work path ({@code <orcidId>/work/<workId>}) is appended to. */
    private static final String ORCID_API_BASE_URL = "https://api.orcid.org/v3.0/";

    /** Minimum time, in milliseconds, spent on each request before the next one is issued by the same task. */
    private static final long MIN_REQUEST_INTERVAL_MS = 1000;

    /** Authors with more works than this are counted in the {@code max_modified_works_limit} accumulator. */
    private static final int MAX_MODIFIED_WORKS_LIMIT = 50;

    /** Length of a timestamp formatted with {@link #LAMBDA_FILE_DATE_FORMAT}. */
    private static final int LAMBDA_FILE_DATE_LENGTH = 19;

    /**
     * Job entry point.
     *
     * @param args command line arguments, parsed according to
     *             {@code /eu/dnetlib/dhp/doiboost/download_orcid_data.json}
     * @throws Exception when argument parsing or the Spark job fails
     */
    public static void main(String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkDownloadOrcidWorks.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
        parser.parseArgument(args);
        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
        final String workingPath = parser.get("workingPath");
        // the "{}" placeholder is required, otherwise SLF4J silently drops the argument
        logger.info("workingPath: {}", workingPath);
        final String outputPath = parser.get("outputPath");
        final String token = parser.get("token");
        final String hdfsServerUri = parser.get("hdfsServerUri");

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
                logger.info("lastUpdateValue: {}", lastUpdateValue);

                JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

                LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
                LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
                LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
                LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
                LongAccumulator maxModifiedWorksLimitAcc = spark
                    .sparkContext()
                    .longAccumulator("max_modified_works_limit");
                LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
                LongAccumulator errorLoadingJsonFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("error_loading_json_found");
                LongAccumulator errorLoadingXMLFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("error_loading_xml_found");
                LongAccumulator errorParsingXMLFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("error_parsing_xml_found");
                LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
                LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
                LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
                LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
                LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
                LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
                LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");

                JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
                    .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
                updatedAuthorsAcc.setValue(updatedAuthorsRDD.count());

                // Maps one downloaded author record to the relative URLs of its works modified after the last update.
                FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
                    String orcidId = data._1().toString();
                    String jsonData = data._2().toString();
                    List<String> workIds = new ArrayList<>();
                    Map<String, String> workIdLastModifiedDate = new HashMap<>();
                    JsonElement jElement = new JsonParser().parse(jsonData);
                    String statusCode = getJsonValue(jElement, "statusCode");
                    if (statusCode.equals("200")) {
                        String compressedData = getJsonValue(jElement, "compressedData");
                        if (StringUtils.isEmpty(compressedData)) {
                            errorLoadingJsonFoundAcc.add(1);
                        } else {
                            String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
                            if (StringUtils.isEmpty(authorSummary)) {
                                errorLoadingXMLFoundAcc.add(1);
                            } else {
                                try {
                                    // NOTE(review): getBytes() uses the platform default charset - confirm it
                                    // matches the encoding expected by the XML parser
                                    workIdLastModifiedDate = XMLRecordParser
                                        .retrieveWorkIdLastModifiedDate(authorSummary.getBytes());
                                } catch (Exception e) {
                                    logger.error("parsing " + orcidId + " [" + jsonData + "]", e);
                                    errorParsingXMLFoundAcc.add(1);
                                }
                            }
                        }
                    } else {
                        errorCodeFoundAcc.add(1);
                    }
                    parsedAuthorsAcc.add(1);
                    workIdLastModifiedDate.forEach((workId, lastModifiedDate) -> {
                        parsedWorksAcc.add(1);
                        if (isModified(orcidId, lastModifiedDate, lastUpdateValue)) {
                            modifiedWorksAcc.add(1);
                            workIds.add(orcidId.concat("/work/").concat(workId));
                        }
                    });
                    if (workIdLastModifiedDate.size() > MAX_MODIFIED_WORKS_LIMIT) {
                        maxModifiedWorksLimitAcc.add(1);
                    }
                    return workIds.iterator();
                };

                // Downloads a single work; the outcome (status code and, on success, the compressed body)
                // is returned as the tuple produced by DownloadedRecordData.toTuple2().
                Function<String, Tuple2<String, String>> downloadWorkFunction = relativeWorkUrl -> {
                    String orcidId = relativeWorkUrl.split("/")[0];
                    final DownloadedRecordData downloaded = new DownloadedRecordData();
                    downloaded.setOrcidId(orcidId);
                    downloaded.setLastModifiedDate(lastUpdateValue);

                    HttpGet httpGet = new HttpGet(ORCID_API_BASE_URL + relativeWorkUrl);
                    httpGet.addHeader("Accept", "application/vnd.orcid+xml");
                    httpGet.addHeader("Authorization", String.format("Bearer %s", token));

                    // try-with-resources: client and response are released on every path,
                    // including non-200 responses and exceptions
                    try (CloseableHttpClient client = HttpClients.createDefault()) {
                        long startReq = System.currentTimeMillis();
                        try (CloseableHttpResponse response = client.execute(httpGet)) {
                            long reqTime = System.currentTimeMillis() - startReq;
                            // throttle: make each request take at least MIN_REQUEST_INTERVAL_MS
                            if (reqTime < MIN_REQUEST_INTERVAL_MS) {
                                Thread.sleep(MIN_REQUEST_INTERVAL_MS - reqTime);
                            }
                            int statusCode = response.getStatusLine().getStatusCode();
                            downloaded.setStatusCode(statusCode);
                            if (statusCode != 200) {
                                // each status increments only its own counter (no fall-through)
                                switch (statusCode) {
                                    case 403:
                                        errorHTTP403Acc.add(1);
                                        break;
                                    case 404:
                                        errorHTTP404Acc.add(1);
                                        break;
                                    case 409:
                                        errorHTTP409Acc.add(1);
                                        break;
                                    case 503:
                                        errorHTTP503Acc.add(1);
                                        break;
                                    case 525:
                                        errorHTTP525Acc.add(1);
                                        break;
                                    default:
                                        errorHTTPGenericAcc.add(1);
                                        break;
                                }
                                logger.info("Downloading {} status code: {}", orcidId, statusCode);
                                return downloaded.toTuple2();
                            }
                            downloadedRecordsAcc.add(1);
                            downloaded
                                .setCompressedData(
                                    ArgumentApplicationParser
                                        .compressArgument(IOUtils.toString(response.getEntity().getContent())));
                            return downloaded.toTuple2();
                        }
                    }
                };

                updatedAuthorsRDD
                    .flatMap(retrieveWorkUrlFunction)
                    .repartition(100)
                    .map(downloadWorkFunction)
                    .mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
                    .saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);

                logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value());
                logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value());
                logger.info("parsedWorksAcc: {}", parsedWorksAcc.value());
                logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value());
                logger.info("maxModifiedWorksLimitAcc: {}", maxModifiedWorksLimitAcc.value());
                logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value());
                logger.info("errorLoadingJsonFoundAcc: {}", errorLoadingJsonFoundAcc.value());
                logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value());
                logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value());
                logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
                logger.info("errorHTTP403Acc: {}", errorHTTP403Acc.value());
                logger.info("errorHTTP404Acc: {}", errorHTTP404Acc.value());
                logger.info("errorHTTP409Acc: {}", errorHTTP409Acc.value());
                logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value());
                logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value());
                logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value());
            });
    }

    /**
     * Tells whether a work was modified after the last update.
     *
     * <p>Both values are parsed into {@link LocalDate}, so the comparison has day granularity: a work
     * modified on the same calendar day as the last update is reported as not modified.
     *
     * @param orcidId           identifier of the author, used only in log and error messages
     * @param modifiedDateValue last-modified timestamp of the work, in {@link #ORCID_XML_DATETIME_FORMAT}
     * @param lastUpdateValue   timestamp of the last update, in {@link #LAMBDA_FILE_DATE_FORMAT}; anything
     *                          after the first 19 characters is ignored
     * @return {@code true} when the modification date is strictly after the last update date
     * @throws RuntimeException when either value cannot be parsed; the original failure is kept as cause
     */
    public static boolean isModified(String orcidId, String modifiedDateValue, String lastUpdateValue) {
        LocalDate modifiedDate;
        LocalDate lastUpdate;
        try {
            modifiedDate = LocalDate.parse(modifiedDateValue, ORCID_XML_DATETIMEFORMATTER);
            // drop whatever follows "yyyy-MM-dd HH:mm:ss" (e.g. fractional seconds)
            String normalizedLastUpdate = lastUpdateValue.length() != LAMBDA_FILE_DATE_LENGTH
                ? lastUpdateValue.substring(0, LAMBDA_FILE_DATE_LENGTH)
                : lastUpdateValue;
            lastUpdate = LocalDate.parse(normalizedLastUpdate, LAMBDA_FILE_DATE_FORMATTER);
        } catch (Exception e) {
            logger.info("[{}] Parsing date: {}", orcidId, e.getMessage());
            throw new RuntimeException("[" + orcidId + "] Parsing date: " + e.getMessage(), e);
        }
        return modifiedDate.isAfter(lastUpdate);
    }

    /**
     * Reads a property of a JSON object as a string.
     *
     * @param jElement JSON element, expected to be a JSON object
     * @param property name of the property to read
     * @return the property value, or the empty string when the property is missing or JSON null
     */
    private static String getJsonValue(JsonElement jElement, String property) {
        if (jElement.getAsJsonObject().has(property)) {
            JsonElement value = jElement.getAsJsonObject().get(property);
            if (value != null && !value.isJsonNull()) {
                return value.getAsString();
            }
        }
        return "";
    }
}

View File

@ -3,9 +3,7 @@ package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedReader; import java.io.*;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI; import java.net.URI;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@ -17,6 +15,7 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile;
@ -26,6 +25,7 @@ import org.apache.spark.SparkConf;
import org.mortbay.log.Log; import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
public class SparkGenLastModifiedSeq { public class SparkGenLastModifiedSeq {
private static String hdfsServerUri; private static String hdfsServerUri;
@ -50,6 +50,7 @@ public class SparkGenLastModifiedSeq {
outputPath = parser.get("outputPath"); outputPath = parser.get("outputPath");
lambdaFileName = parser.get("lambdaFileName"); lambdaFileName = parser.get("lambdaFileName");
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName); String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
String lastModifiedDateFromLambdaFileUri = "last_modified_date_from_lambda_file.txt";
SparkConf sparkConf = new SparkConf(); SparkConf sparkConf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -57,6 +58,7 @@ public class SparkGenLastModifiedSeq {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
int rowsNum = 0; int rowsNum = 0;
String lastModifiedAuthorDate = "";
Path output = new Path( Path output = new Path(
hdfsServerUri hdfsServerUri
.concat(workingPath) .concat(workingPath)
@ -89,10 +91,17 @@ public class SparkGenLastModifiedSeq {
final Text value = new Text(recordInfo.get(3)); final Text value = new Text(recordInfo.get(3));
writer.append(key, value); writer.append(key, value);
rowsNum++; rowsNum++;
if (rowsNum == 2) {
lastModifiedAuthorDate = value.toString();
}
} }
} }
} }
} }
HDFSUtil
.writeToTextFile(
hdfsServerUri, workingPath, lastModifiedDateFromLambdaFileUri, lastModifiedAuthorDate);
Log.info("Saved rows from lamda csv tar file: " + rowsNum); Log.info("Saved rows from lamda csv tar file: " + rowsNum);
}); });
} }

View File

@ -4,15 +4,13 @@ package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.*;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -25,13 +23,15 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.esotericsoftware.minlog.Log; import com.esotericsoftware.minlog.Log;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonElement; import com.google.gson.JsonElement;
import com.google.gson.JsonParser; import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.dhp.schema.orcid.OrcidDOI;
import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
import scala.Tuple2; import scala.Tuple2;
public class SparkGenerateDoiAuthorList { public class SparkGenerateDoiAuthorList {
@ -56,6 +56,10 @@ public class SparkGenerateDoiAuthorList {
logger.info("workingPath: ", workingPath); logger.info("workingPath: ", workingPath);
final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath"); final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath); logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath);
final String authorsPath = parser.get("authorsPath");
logger.info("authorsPath: ", authorsPath);
final String xmlWorksPath = parser.get("xmlWorksPath");
logger.info("xmlWorksPath: ", xmlWorksPath);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -65,17 +69,21 @@ public class SparkGenerateDoiAuthorList {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class); .sequenceFile(workingPath.concat(authorsPath), Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark Dataset<AuthorData> summariesDataset = spark
.createDataset( .createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(AuthorData.class)); Encoders.bean(AuthorData.class));
JavaPairRDD<Text, Text> activitiesRDD = sc JavaPairRDD<Text, Text> xmlWorksRDD = sc
.sequenceFile(workingPath + "/output/*.seq", Text.class, Text.class); .sequenceFile(workingPath.concat(xmlWorksPath), Text.class, Text.class);
Dataset<WorkData> activitiesDataset = spark Dataset<WorkData> activitiesDataset = spark
.createDataset( .createDataset(
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), xmlWorksRDD
.map(seq -> XMLRecordParser.VTDParseWorkData(seq._2().toString().getBytes()))
.filter(work -> work != null && work.getErrorCode() == null && work.isDoiFound())
.rdd(),
Encoders.bean(WorkData.class)); Encoders.bean(WorkData.class));
Function<Tuple2<String, AuthorData>, Tuple2<String, List<AuthorData>>> toAuthorListFunction = data -> { Function<Tuple2<String, AuthorData>, Tuple2<String, List<AuthorData>>> toAuthorListFunction = data -> {
@ -135,13 +143,19 @@ public class SparkGenerateDoiAuthorList {
} }
return null; return null;
}) })
.mapToPair( .mapToPair(s -> {
s -> { List<AuthorData> authorList = s._2();
ObjectMapper mapper = new ObjectMapper(); Set<String> oidsAlreadySeen = new HashSet<>();
return new Tuple2<>(s._1(), mapper.writeValueAsString(s._2())); authorList.removeIf(a -> !oidsAlreadySeen.add(a.getOid()));
}) return new Tuple2<>(s._1(), authorList);
.repartition(10) })
.saveAsTextFile(workingPath + outputDoiAuthorListPath); .map(s -> {
OrcidDOI orcidDOI = new OrcidDOI();
orcidDOI.setDoi(s._1());
orcidDOI.setAuthors(s._2());
return JsonWriter.create(orcidDOI);
})
.saveAsTextFile(workingPath + outputDoiAuthorListPath, GzipCodec.class);
}); });
} }

View File

@ -0,0 +1,242 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
import scala.Tuple2;
/**
 * Spark job that merges the freshly downloaded ORCID author summaries into the current authors dataset.
 *
 * <p>Downloaded records are read from {@code workingPath + "downloads/updated_authors/*"} and the current
 * dataset from {@code workingPath + "orcid_dataset/authors/*"}. The two are full-outer-joined on
 * {@code authorData.oid}; when both sides are present the downloaded record wins. The merged dataset is
 * then de-duplicated (by compressed payload, and by keeping for each repeated oid the record with the
 * greatest {@code downloadDate}) and written as gzip-compressed JSON lines to
 * {@code workingPath + "orcid_dataset/new_authors"}.
 */
public class SparkUpdateOrcidAuthors {

    // static, so that the Spark closures below do not have to capture and serialize a logger instance
    private static final Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidAuthors.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
        .setSerializationInclusion(JsonInclude.Include.NON_NULL);

    /**
     * Job entry point.
     *
     * @param args command line arguments, parsed according to
     *             {@code /eu/dnetlib/dhp/doiboost/download_orcid_data.json}
     * @throws Exception when argument parsing or the Spark job fails
     */
    public static void main(String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkUpdateOrcidAuthors.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
        parser.parseArgument(args);
        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        final String workingPath = parser.get("workingPath");

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

                LongAccumulator oldAuthorsFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("old_authors_found");
                LongAccumulator updatedAuthorsFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("updated_authors_found");
                LongAccumulator newAuthorsFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("new_authors_found");
                LongAccumulator errorCodeAuthorsFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("error_code_authors_found");
                LongAccumulator errorLoadingAuthorsJsonFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("error_loading_authors_json_found");
                LongAccumulator errorParsingAuthorsXMLFoundAcc = spark
                    .sparkContext()
                    .longAccumulator("error_parsing_authors_xml_found");

                // Turns one downloaded record (orcid id -> JSON envelope) into an AuthorSummary.
                // When the payload is missing or cannot be parsed an (almost) empty AuthorSummary is returned.
                Function<Tuple2<Text, Text>, AuthorSummary> retrieveAuthorSummaryFunction = data -> {
                    String orcidId = data._1().toString();
                    String jsonData = data._2().toString();
                    JsonElement jElement = new JsonParser().parse(jsonData);
                    String statusCode = getJsonValue(jElement, "statusCode");

                    AuthorSummary emptySummary = new AuthorSummary();
                    if (!statusCode.equals("200")) {
                        emptySummary.setStatusCode(statusCode);
                        emptySummary.setDownloadDate(Long.toString(System.currentTimeMillis()));
                        errorCodeAuthorsFoundAcc.add(1);
                        return emptySummary;
                    }

                    String compressedData = getJsonValue(jElement, "compressedData");
                    if (StringUtils.isEmpty(compressedData)) {
                        errorLoadingAuthorsJsonFoundAcc.add(1);
                        return emptySummary;
                    }

                    String xmlAuthor = ArgumentApplicationParser.decompressValue(compressedData);
                    try {
                        // parsed into its own local: a failure must never leak a null or half-built record
                        AuthorSummary parsed = XMLRecordParser
                            .VTDParseAuthorSummary(xmlAuthor.getBytes());
                        parsed.setStatusCode(statusCode);
                        parsed.setDownloadDate(Long.toString(System.currentTimeMillis()));
                        parsed.setBase64CompressData(compressedData);
                        return parsed;
                    } catch (Exception e) {
                        logger.error("parsing xml " + orcidId + " [" + jsonData + "]", e);
                        errorParsingAuthorsXMLFoundAcc.add(1);
                        return emptySummary;
                    }
                };

                Dataset<AuthorSummary> downloadedAuthorSummaryDS = spark
                    .createDataset(
                        sc
                            .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class)
                            .map(retrieveAuthorSummaryFunction)
                            .rdd(),
                        Encoders.bean(AuthorSummary.class));

                Dataset<AuthorSummary> currentAuthorSummaryDS = spark
                    .createDataset(
                        sc
                            .textFile(workingPath.concat("orcid_dataset/authors/*"))
                            .map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
                            .rdd(),
                        Encoders.bean(AuthorSummary.class));

                // The merged dataset feeds several actions below. It is cached because its lineage is not
                // repeatable: downloadDate is stamped with the wall clock while the records are built, so a
                // recomputation would produce timestamps that no longer match the ones collected on the
                // driver, and it would also apply the accumulator updates again.
                Dataset<AuthorSummary> mergedAuthorSummaryDS = currentAuthorSummaryDS
                    .joinWith(
                        downloadedAuthorSummaryDS,
                        currentAuthorSummaryDS
                            .col("authorData.oid")
                            .equalTo(downloadedAuthorSummaryDS.col("authorData.oid")),
                        "full_outer")
                    .map(value -> {
                        AuthorSummary current = value._1();
                        AuthorSummary downloaded = value._2();
                        if (current == null) {
                            newAuthorsFoundAcc.add(1);
                            return downloaded;
                        }
                        if (downloaded == null) {
                            oldAuthorsFoundAcc.add(1);
                            return current;
                        }
                        // both sides present: the downloaded record replaces the current one
                        updatedAuthorsFoundAcc.add(1);
                        return downloaded;
                    },
                        Encoders.bean(AuthorSummary.class))
                    .filter(Objects::nonNull)
                    .cache();

                long mergedCount = mergedAuthorSummaryDS.count();

                Dataset<AuthorSummary> base64DedupedDS = mergedAuthorSummaryDS.dropDuplicates("base64CompressData");

                // oids that still occur more than once after the payload-based de-duplication
                Set<String> dupOids = new HashSet<>(
                    base64DedupedDS
                        .groupBy("authorData.oid")
                        .agg(count("authorData.oid").alias("oidOccurrenceCount"))
                        .where("oidOccurrenceCount > 1")
                        .select("oid")
                        .toJavaRDD()
                        .map(row -> row.get(0).toString())
                        .collect());

                JavaRDD<AuthorSummary> dupAuthors = base64DedupedDS
                    .toJavaRDD()
                    .filter(
                        authorSummary -> (Objects.nonNull(authorSummary.getAuthorData())
                            && Objects.nonNull(authorSummary.getAuthorData().getOid())))
                    .filter(authorSummary -> dupOids.contains(authorSummary.getAuthorData().getOid()));

                Dataset<AuthorSummary> dupAuthorSummaryDS = spark
                    .createDataset(
                        dupAuthors.rdd(),
                        Encoders.bean(AuthorSummary.class));

                // for every repeated oid, the greatest downloadDate among its records
                List<Tuple2<String, String>> lastModifiedAuthors = dupAuthorSummaryDS
                    .groupBy("authorData.oid")
                    .agg(array_max(collect_list("downloadDate")))
                    .map(
                        row -> new Tuple2<>(row.get(0).toString(), row.get(1).toString()),
                        Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
                    .toJavaRDD()
                    .collect();
                Map<String, String> latestDownloadDateByOid = new HashMap<>();
                for (Tuple2<String, String> oidAndDate : lastModifiedAuthors) {
                    latestDownloadDateByOid.put(oidAndDate._1(), oidAndDate._2());
                }

                // keep records whose oid is not repeated, and, for repeated oids, only the latest download
                JavaRDD<AuthorSummary> lastDownloadedAuthors = base64DedupedDS
                    .toJavaRDD()
                    .filter(
                        authorSummary -> (Objects.nonNull(authorSummary.getAuthorData())
                            && Objects.nonNull(authorSummary.getAuthorData().getOid())))
                    .filter(authorSummary -> {
                        String latestDownloadDate = latestDownloadDateByOid
                            .get(authorSummary.getAuthorData().getOid());
                        return latestDownloadDate == null
                            || latestDownloadDate.equals(authorSummary.getDownloadDate());
                    });

                Dataset<AuthorSummary> cleanedDS = spark
                    .createDataset(
                        lastDownloadedAuthors.rdd(),
                        Encoders.bean(AuthorSummary.class))
                    .dropDuplicates("downloadDate", "authorData");

                cleanedDS
                    .toJavaRDD()
                    .map(authorSummary -> OBJECT_MAPPER.writeValueAsString(authorSummary))
                    .saveAsTextFile(workingPath.concat("orcid_dataset/new_authors"), GzipCodec.class);
                long cleanedDSCount = cleanedDS.count();

                mergedAuthorSummaryDS.unpersist();

                logger.info("report_oldAuthorsFoundAcc: {}", oldAuthorsFoundAcc.value());
                logger.info("report_newAuthorsFoundAcc: {}", newAuthorsFoundAcc.value());
                logger.info("report_updatedAuthorsFoundAcc: {}", updatedAuthorsFoundAcc.value());
                logger.info("report_errorCodeFoundAcc: {}", errorCodeAuthorsFoundAcc.value());
                logger.info("report_errorLoadingJsonFoundAcc: {}", errorLoadingAuthorsJsonFoundAcc.value());
                logger.info("report_errorParsingXMLFoundAcc: {}", errorParsingAuthorsXMLFoundAcc.value());
                logger.info("report_merged_count: {}", mergedCount);
                logger.info("report_cleaned_count: {}", cleanedDSCount);
            });
    }

    /**
     * Reads a property of a JSON object as a string.
     *
     * @param jElement JSON element, expected to be a JSON object
     * @param property name of the property to read
     * @return the property value, or the empty string when the property is missing or JSON null
     */
    private static String getJsonValue(JsonElement jElement, String property) {
        if (jElement.getAsJsonObject().has(property)) {
            JsonElement value = jElement.getAsJsonObject().get(property);
            if (value != null && !value.isJsonNull()) {
                return value.getAsString();
            }
        }
        return "";
    }
}

View File

@ -0,0 +1,317 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
import scala.Tuple2;
public class SparkUpdateOrcidDatasets {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidDatasets.class);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkUpdateOrcidDatasets.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String workingPath = parser.get("workingPath");
// final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator oldAuthorsFoundAcc = spark
.sparkContext()
.longAccumulator("old_authors_found");
LongAccumulator updatedAuthorsFoundAcc = spark
.sparkContext()
.longAccumulator("updated_authors_found");
LongAccumulator newAuthorsFoundAcc = spark
.sparkContext()
.longAccumulator("new_authors_found");
LongAccumulator errorCodeAuthorsFoundAcc = spark
.sparkContext()
.longAccumulator("error_code_authors_found");
LongAccumulator errorLoadingAuthorsJsonFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_authors_json_found");
LongAccumulator errorParsingAuthorsXMLFoundAcc = spark
.sparkContext()
.longAccumulator("error_parsing_authors_xml_found");
LongAccumulator oldWorksFoundAcc = spark
.sparkContext()
.longAccumulator("old_works_found");
LongAccumulator updatedWorksFoundAcc = spark
.sparkContext()
.longAccumulator("updated_works_found");
LongAccumulator newWorksFoundAcc = spark
.sparkContext()
.longAccumulator("new_works_found");
LongAccumulator errorCodeWorksFoundAcc = spark
.sparkContext()
.longAccumulator("error_code_works_found");
LongAccumulator errorLoadingWorksJsonFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_works_json_found");
LongAccumulator errorParsingWorksXMLFoundAcc = spark
.sparkContext()
.longAccumulator("error_parsing_works_xml_found");
// JavaPairRDD<Text, Text> xmlSummariesRDD = sc
// .sequenceFile(workingPath.concat("xml/authors/xml_authors.seq"), Text.class, Text.class);
// xmlSummariesRDD
// .map(seq -> {
// AuthorSummary authorSummary = XMLRecordParser
// .VTDParseAuthorSummary(seq._2().toString().getBytes());
// authorSummary
// .setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString()));
// return authorSummary;
// })
// .filter(authorSummary -> authorSummary != null)
// .map(authorSummary -> JsonWriter.create(authorSummary))
// .saveAsTextFile(workingPath.concat("orcid_dataset/authors"), GzipCodec.class);
//
// JavaPairRDD<Text, Text> xmlWorksRDD = sc
// .sequenceFile(workingPath.concat("xml/works/*"), Text.class, Text.class);
//
// xmlWorksRDD
// .map(seq -> {
// WorkDetail workDetail = XMLRecordParserNoDoi.VTDParseWorkData(seq._2().toString().getBytes());
// Work work = new Work();
// work.setWorkDetail(workDetail);
// work.setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString()));
// return work;
// })
// .filter(work -> work != null)
// .map(work -> JsonWriter.create(work))
// .saveAsTextFile(workingPath.concat("orcid_dataset/works"), GzipCodec.class);
// Function<Tuple2<Text, Text>, AuthorSummary> retrieveAuthorSummaryFunction = data -> {
// AuthorSummary authorSummary = new AuthorSummary();
// String orcidId = data._1().toString();
// String jsonData = data._2().toString();
// JsonElement jElement = new JsonParser().parse(jsonData);
// String statusCode = getJsonValue(jElement, "statusCode");
// String downloadDate = getJsonValue(jElement, "lastModifiedDate");
// if (statusCode.equals("200")) {
// String compressedData = getJsonValue(jElement, "compressedData");
// if (StringUtils.isEmpty(compressedData)) {
// errorLoadingAuthorsJsonFoundAcc.add(1);
// } else {
// String xmlAuthor = ArgumentApplicationParser.decompressValue(compressedData);
// try {
// authorSummary = XMLRecordParser
// .VTDParseAuthorSummary(xmlAuthor.getBytes());
// authorSummary.setStatusCode(statusCode);
// authorSummary.setDownloadDate("2020-11-18 00:00:05.644768");
// authorSummary.setBase64CompressData(compressedData);
// return authorSummary;
// } catch (Exception e) {
// logger.error("parsing xml " + orcidId + " [" + jsonData + "]", e);
// errorParsingAuthorsXMLFoundAcc.add(1);
// }
// }
// } else {
// authorSummary.setStatusCode(statusCode);
// authorSummary.setDownloadDate("2020-11-18 00:00:05.644768");
// errorCodeAuthorsFoundAcc.add(1);
// }
// return authorSummary;
// };
//
// Dataset<AuthorSummary> downloadedAuthorSummaryDS = spark
// .createDataset(
// sc
// .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class)
// .map(retrieveAuthorSummaryFunction)
// .rdd(),
// Encoders.bean(AuthorSummary.class));
// Dataset<AuthorSummary> currentAuthorSummaryDS = spark
// .createDataset(
// sc
// .textFile(workingPath.concat("orcid_dataset/authors/*"))
// .map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
// .rdd(),
// Encoders.bean(AuthorSummary.class));
// currentAuthorSummaryDS
// .joinWith(
// downloadedAuthorSummaryDS,
// currentAuthorSummaryDS
// .col("authorData.oid")
// .equalTo(downloadedAuthorSummaryDS.col("authorData.oid")),
// "full_outer")
// .map(value -> {
// Optional<AuthorSummary> opCurrent = Optional.ofNullable(value._1());
// Optional<AuthorSummary> opDownloaded = Optional.ofNullable(value._2());
// if (!opCurrent.isPresent()) {
// newAuthorsFoundAcc.add(1);
// return opDownloaded.get();
// }
// if (!opDownloaded.isPresent()) {
// oldAuthorsFoundAcc.add(1);
// return opCurrent.get();
// }
// if (opCurrent.isPresent() && opDownloaded.isPresent()) {
// updatedAuthorsFoundAcc.add(1);
// return opDownloaded.get();
// }
// return null;
// },
// Encoders.bean(AuthorSummary.class))
// .filter(Objects::nonNull)
// .toJavaRDD()
// .map(authorSummary -> OBJECT_MAPPER.writeValueAsString(authorSummary))
// .saveAsTextFile(workingPath.concat("orcid_dataset/new_authors"), GzipCodec.class);
//
// logger.info("oldAuthorsFoundAcc: " + oldAuthorsFoundAcc.value().toString());
// logger.info("newAuthorsFoundAcc: " + newAuthorsFoundAcc.value().toString());
// logger.info("updatedAuthorsFoundAcc: " + updatedAuthorsFoundAcc.value().toString());
// logger.info("errorCodeFoundAcc: " + errorCodeAuthorsFoundAcc.value().toString());
// logger.info("errorLoadingJsonFoundAcc: " + errorLoadingAuthorsJsonFoundAcc.value().toString());
// logger.info("errorParsingXMLFoundAcc: " + errorParsingAuthorsXMLFoundAcc.value().toString());
Function<String, Work> retrieveWorkFunction = jsonData -> {
Work work = new Work();
JsonElement jElement = new JsonParser().parse(jsonData);
String statusCode = getJsonValue(jElement, "statusCode");
work.setStatusCode(statusCode);
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
work.setDownloadDate("2020-11-18 00:00:05.644768");
if (statusCode.equals("200")) {
String compressedData = getJsonValue(jElement, "compressedData");
if (StringUtils.isEmpty(compressedData)) {
errorLoadingWorksJsonFoundAcc.add(1);
} else {
String xmlWork = ArgumentApplicationParser.decompressValue(compressedData);
try {
WorkDetail workDetail = XMLRecordParserNoDoi
.VTDParseWorkData(xmlWork.getBytes());
work.setWorkDetail(workDetail);
work.setBase64CompressData(compressedData);
return work;
} catch (Exception e) {
logger.error("parsing xml [" + jsonData + "]", e);
errorParsingWorksXMLFoundAcc.add(1);
}
}
} else {
errorCodeWorksFoundAcc.add(1);
}
return work;
};
Dataset<Work> downloadedWorksDS = spark
.createDataset(
sc
.textFile(workingPath + "downloads/updated_works/*")
.map(s -> {
return s.substring(21, s.length() - 1);
})
.map(retrieveWorkFunction)
.rdd(),
Encoders.bean(Work.class));
Dataset<Work> currentWorksDS = spark
.createDataset(
sc
.textFile(workingPath.concat("orcid_dataset/works/*"))
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
.rdd(),
Encoders.bean(Work.class));
currentWorksDS
.joinWith(
downloadedWorksDS,
currentWorksDS
.col("workDetail.id")
.equalTo(downloadedWorksDS.col("workDetail.id"))
.and(
currentWorksDS
.col("workDetail.oid")
.equalTo(downloadedWorksDS.col("workDetail.oid"))),
"full_outer")
.map(value -> {
Optional<Work> opCurrent = Optional.ofNullable(value._1());
Optional<Work> opDownloaded = Optional.ofNullable(value._2());
if (!opCurrent.isPresent()) {
newWorksFoundAcc.add(1);
return opDownloaded.get();
}
if (!opDownloaded.isPresent()) {
oldWorksFoundAcc.add(1);
return opCurrent.get();
}
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
updatedWorksFoundAcc.add(1);
return opDownloaded.get();
}
return null;
},
Encoders.bean(Work.class))
.filter(Objects::nonNull)
.toJavaRDD()
.map(work -> OBJECT_MAPPER.writeValueAsString(work))
.saveAsTextFile(workingPath.concat("orcid_dataset/new_works"), GzipCodec.class);
logger.info("oldWorksFoundAcc: " + oldWorksFoundAcc.value().toString());
logger.info("newWorksFoundAcc: " + newWorksFoundAcc.value().toString());
logger.info("updatedWorksFoundAcc: " + updatedWorksFoundAcc.value().toString());
logger.info("errorCodeWorksFoundAcc: " + errorCodeWorksFoundAcc.value().toString());
logger.info("errorLoadingJsonWorksFoundAcc: " + errorLoadingWorksJsonFoundAcc.value().toString());
logger.info("errorParsingXMLWorksFoundAcc: " + errorParsingWorksXMLFoundAcc.value().toString());
});
}
/**
 * Reads a member of a JSON object as a string.
 *
 * @param jElement the parsed JSON element; it is accessed as a JSON object
 * @param property the name of the member to read
 * @return the member's string value, or the empty string when the member is absent or is JSON null
 */
private static String getJsonValue(JsonElement jElement, String property) {
	if (!jElement.getAsJsonObject().has(property)) {
		return "";
	}
	final JsonElement value = jElement.getAsJsonObject().get(property);
	return (value == null || value.isJsonNull()) ? "" : value.getAsString();
}
}

View File

@ -0,0 +1,186 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
/**
 * Spark job that merges the downloaded ORCID works into the current works dataset.
 * <p>
 * The downloaded records are read from {@code <workingPath>downloads/updated_works/*}, the current ones from
 * {@code <workingPath>orcid_dataset/works/*}. The two datasets are combined with a full outer join on
 * {@code workDetail.id} and {@code workDetail.oid}; for every pair the downloaded record wins when present. The
 * result is written, gzip-compressed and one JSON document per line, to
 * {@code <workingPath>orcid_dataset/new_works}. Finally the content of
 * {@code last_modified_date_from_lambda_file.txt} is copied into {@code last_update.txt}.
 */
public class SparkUpdateOrcidWorks {

	// Kept as a static field so that the Spark lambdas below do not capture a local Logger instance.
	private static final Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidWorks.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
		.setSerializationInclusion(JsonInclude.Include.NON_NULL);

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments, parsed according to
	 *             {@code /eu/dnetlib/dhp/doiboost/download_orcid_data.json}; {@code workingPath} and
	 *             {@code hdfsServerUri} are read, {@code isSparkSessionManaged} defaults to {@code true}
	 * @throws IOException when the parameter definition resource cannot be read
	 * @throws Exception   when argument parsing or the Spark job fails
	 */
	public static void main(String[] args) throws IOException, Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkUpdateOrcidWorks.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
		parser.parseArgument(args);
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		final String workingPath = parser.get("workingPath");
		final String hdfsServerUri = parser.get("hdfsServerUri");

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

				LongAccumulator oldWorksFoundAcc = spark
					.sparkContext()
					.longAccumulator("old_works_found");
				LongAccumulator updatedWorksFoundAcc = spark
					.sparkContext()
					.longAccumulator("updated_works_found");
				LongAccumulator newWorksFoundAcc = spark
					.sparkContext()
					.longAccumulator("new_works_found");
				LongAccumulator errorCodeWorksFoundAcc = spark
					.sparkContext()
					.longAccumulator("error_code_works_found");
				LongAccumulator errorLoadingWorksJsonFoundAcc = spark
					.sparkContext()
					.longAccumulator("error_loading_works_json_found");
				LongAccumulator errorParsingWorksXMLFoundAcc = spark
					.sparkContext()
					.longAccumulator("error_parsing_works_xml_found");

				// Turns one downloaded JSON record into a Work. The Work is always returned; its
				// WorkDetail is only populated when the status code is 200 and the compressed XML parses.
				Function<String, Work> retrieveWorkFunction = jsonData -> {
					Work work = new Work();
					JsonElement jElement = new JsonParser().parse(jsonData);
					String statusCode = getJsonValue(jElement, "statusCode");
					work.setStatusCode(statusCode);
					work.setDownloadDate(Long.toString(System.currentTimeMillis()));
					if ("200".equals(statusCode)) {
						String compressedData = getJsonValue(jElement, "compressedData");
						if (StringUtils.isEmpty(compressedData)) {
							errorLoadingWorksJsonFoundAcc.add(1);
						} else {
							String xmlWork = ArgumentApplicationParser.decompressValue(compressedData);
							try {
								WorkDetail workDetail = XMLRecordParserNoDoi
									.VTDParseWorkData(xmlWork.getBytes());
								work.setWorkDetail(workDetail);
								work.setBase64CompressData(compressedData);
							} catch (Exception e) {
								logger.error("parsing xml [" + jsonData + "]", e);
								errorParsingWorksXMLFoundAcc.add(1);
							}
						}
					} else {
						errorCodeWorksFoundAcc.add(1);
					}
					return work;
				};

				Dataset<Work> downloadedWorksDS = spark
					.createDataset(
						sc
							.textFile(workingPath + "downloads/updated_works/*")
							// NOTE(review): drops the first 21 characters and the last one of every line;
							// presumably a "(<19-char orcid id>," prefix and a closing ")" around the JSON
							// payload - confirm against the job that writes downloads/updated_works.
							.map(s -> s.substring(21, s.length() - 1))
							.map(retrieveWorkFunction)
							.rdd(),
						Encoders.bean(Work.class));

				Dataset<Work> currentWorksDS = spark
					.createDataset(
						sc
							.textFile(workingPath.concat("orcid_dataset/works/*"))
							.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
							.rdd(),
						Encoders.bean(Work.class));

				currentWorksDS
					.joinWith(
						downloadedWorksDS,
						currentWorksDS
							.col("workDetail.id")
							.equalTo(downloadedWorksDS.col("workDetail.id"))
							.and(
								currentWorksDS
									.col("workDetail.oid")
									.equalTo(downloadedWorksDS.col("workDetail.oid"))),
						"full_outer")
					.map(value -> {
						final Work current = value._1();
						final Work downloaded = value._2();
						if (current == null) {
							// only on the downloaded side: a new work
							newWorksFoundAcc.add(1);
							return downloaded;
						}
						if (downloaded == null) {
							// only on the current side: keep the existing work
							oldWorksFoundAcc.add(1);
							return current;
						}
						// on both sides: the downloaded version replaces the current one
						updatedWorksFoundAcc.add(1);
						return downloaded;
					},
						Encoders.bean(Work.class))
					.filter(Objects::nonNull)
					.toJavaRDD()
					.map(work -> OBJECT_MAPPER.writeValueAsString(work))
					.saveAsTextFile(workingPath.concat("orcid_dataset/new_works"), GzipCodec.class);

				logger.info("oldWorksFoundAcc: {}", oldWorksFoundAcc.value());
				logger.info("newWorksFoundAcc: {}", newWorksFoundAcc.value());
				logger.info("updatedWorksFoundAcc: {}", updatedWorksFoundAcc.value());
				logger.info("errorCodeWorksFoundAcc: {}", errorCodeWorksFoundAcc.value());
				logger.info("errorLoadingJsonWorksFoundAcc: {}", errorLoadingWorksJsonFoundAcc.value());
				logger.info("errorParsingXMLWorksFoundAcc: {}", errorParsingWorksXMLFoundAcc.value());

				// Promote the last-modified marker to last_update.txt only after the dataset was written.
				String lastModifiedDateFromLambdaFile = HDFSUtil
					.readFromTextFile(hdfsServerUri, workingPath, "last_modified_date_from_lambda_file.txt");
				HDFSUtil.writeToTextFile(hdfsServerUri, workingPath, "last_update.txt", lastModifiedDateFromLambdaFile);
				logger.info("last_update file updated");
			});
	}

	/**
	 * Reads a member of a JSON object as a string.
	 *
	 * @param jElement the parsed JSON element; it is accessed as a JSON object
	 * @param property the name of the member to read
	 * @return the member's string value, or the empty string when the member is absent or is JSON null
	 */
	private static String getJsonValue(JsonElement jElement, String property) {
		if (jElement.getAsJsonObject().has(property)) {
			final JsonElement value = jElement.getAsJsonObject().get(property);
			if (value != null && !value.isJsonNull()) {
				return value.getAsString();
			}
		}
		return "";
	}
}

View File

@ -3,11 +3,11 @@ package eu.dnetlib.doiboost.orcid.json;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.dhp.schema.orcid.WorkDetail;
public class JsonHelper { public class JsonHelper {
public static String createOidWork(WorkDataNoDoi workData) { public static String createOidWork(WorkDetail workData) {
return new Gson().toJson(workData); return new Gson().toJson(workData);
} }
} }

View File

@ -0,0 +1,67 @@
package eu.dnetlib.doiboost.orcid.util;
import java.io.*;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors;
/**
 * Helpers for reading and writing small UTF-8 text files on HDFS.
 */
public class HDFSUtil {

	static Logger logger = LoggerFactory.getLogger(HDFSUtil.class);

	/**
	 * Obtains the file system for the given server URI.
	 *
	 * @param hdfsServerUri value used for {@code fs.defaultFS}
	 * @return the file system returned by {@link FileSystem#get(Configuration)}
	 * @throws IOException when the file system cannot be obtained
	 */
	private static FileSystem getFileSystem(String hdfsServerUri) throws IOException {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfsServerUri);
		return FileSystem.get(conf);
	}

	/**
	 * Reads the whole content of a text file.
	 *
	 * @param hdfsServerUri value used for {@code fs.defaultFS}
	 * @param workingPath   base path; {@code path} is appended to it without adding a separator
	 * @param path          file name relative to {@code workingPath}
	 * @return the file content with all lines concatenated (line terminators are not preserved)
	 * @throws IOException      when the file cannot be read
	 * @throws RuntimeException when the file does not exist
	 */
	public static String readFromTextFile(String hdfsServerUri, String workingPath, String path) throws IOException {
		FileSystem fileSystem = getFileSystem(hdfsServerUri);
		Path toReadPath = new Path(workingPath.concat(path));
		if (!fileSystem.exists(toReadPath)) {
			throw new RuntimeException("File not exist: " + path);
		}
		logger.info("Last_update_path " + toReadPath.toString());
		StringBuilder sb = new StringBuilder();
		// Decode as UTF-8, the same encoding writeToTextFile uses, instead of the platform default.
		try (BufferedReader br = new BufferedReader(
			new InputStreamReader(fileSystem.open(toReadPath), StandardCharsets.UTF_8))) {
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
		}
		String buffer = sb.toString();
		logger.info("Last_update: " + buffer);
		return buffer;
	}

	/**
	 * Writes a text file, replacing any file already present at the same location.
	 *
	 * @param hdfsServerUri value used for {@code fs.defaultFS}
	 * @param workingPath   base path; {@code path} is appended to it without adding a separator
	 * @param path          file name relative to {@code workingPath}
	 * @param text          content to write, encoded as UTF-8
	 * @throws IOException when the file cannot be deleted, created or written
	 */
	public static void writeToTextFile(String hdfsServerUri, String workingPath, String path, String text)
		throws IOException {
		FileSystem fileSystem = getFileSystem(hdfsServerUri);
		Path toWritePath = new Path(workingPath.concat(path));
		if (fileSystem.exists(toWritePath)) {
			fileSystem.delete(toWritePath, true);
		}
		// try-with-resources closes the stream even when write() fails
		try (BufferedWriter writer = new BufferedWriter(
			new OutputStreamWriter(fileSystem.create(toWritePath), StandardCharsets.UTF_8))) {
			writer.write(text);
		}
	}
}

View File

@ -1,22 +1,19 @@
package eu.dnetlib.doiboost.orcid.xml; package eu.dnetlib.doiboost.orcid.xml;
import java.util.Arrays; import java.io.IOException;
import java.util.List; import java.util.*;
import org.apache.commons.lang3.StringUtils;
import org.mortbay.log.Log; import org.mortbay.log.Log;
import com.ximpleware.AutoPilot; import com.ximpleware.*;
import com.ximpleware.EOFException;
import com.ximpleware.EncodingException;
import com.ximpleware.EntityException;
import com.ximpleware.ParseException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorHistory;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.model.WorkData;
public class XMLRecordParser { public class XMLRecordParser {
@ -32,9 +29,12 @@ public class XMLRecordParser {
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
private static final String NS_RECORD = "record"; private static final String NS_RECORD = "record";
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
private static final String NS_ACTIVITIES = "activities";
private static final String NS_ACTIVITIES_URL = "http://www.orcid.org/ns/activities";
private static final String NS_WORK = "work"; private static final String NS_WORK = "work";
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_HISTORY = "history";
private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
private static final String NS_ERROR = "error"; private static final String NS_ERROR = "error";
@ -51,6 +51,7 @@ public class XMLRecordParser {
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL); ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL); ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
ap.declareXPathNameSpace(NS_HISTORY, NS_HISTORY_URL);
AuthorData authorData = new AuthorData(); AuthorData authorData = new AuthorData();
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
@ -89,6 +90,46 @@ public class XMLRecordParser {
authorData.setOtherNames(otherNames); authorData.setOtherNames(otherNames);
} }
// final String creationMethod = VtdUtilityParser.getSingleValue(ap, vn, "//history:creation-method");
// if (StringUtils.isNoneBlank(creationMethod)) {
// authorData.setCreationMethod(creationMethod);
// }
//
// final String completionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:completion-date");
// if (StringUtils.isNoneBlank(completionDate)) {
// authorData.setCompletionDate(completionDate);
// }
//
// final String submissionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:submission-date");
// if (StringUtils.isNoneBlank(submissionDate)) {
// authorData.setSubmissionDate(submissionDate);
// }
//
// final String claimed = VtdUtilityParser.getSingleValue(ap, vn, "//history:claimed");
// if (StringUtils.isNoneBlank(claimed)) {
// authorData.setClaimed(Boolean.parseBoolean(claimed));
// }
//
// final String verifiedEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-email");
// if (StringUtils.isNoneBlank(verifiedEmail)) {
// authorData.setVerifiedEmail(Boolean.parseBoolean(verifiedEmail));
// }
//
// final String verifiedPrimaryEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-primary-email");
// if (StringUtils.isNoneBlank(verifiedPrimaryEmail)) {
// authorData.setVerifiedPrimaryEmail(Boolean.parseBoolean(verifiedPrimaryEmail));
// }
//
// final String deactivationDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:deactivation-date");
// if (StringUtils.isNoneBlank(deactivationDate)) {
// authorData.setDeactivationDate(deactivationDate);
// }
//
// final String lastModifiedDate = VtdUtilityParser
// .getSingleValue(ap, vn, "//history:history/common:last-modified-date");
// if (StringUtils.isNoneBlank(lastModifiedDate)) {
// authorData.setLastModifiedDate(lastModifiedDate);
// }
return authorData; return authorData;
} }
@ -139,6 +180,12 @@ public class XMLRecordParser {
return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code"); return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
} }
/**
 * Looks up the {@code put-code} attribute of a {@code work:work-summary} element by delegating to
 * {@code retrieveOrcidId} with the activities namespace.
 *
 * @param bytes        the XML document
 * @param defaultValue value passed through to {@code retrieveOrcidId} as the fallback
 * @return the value produced by {@code retrieveOrcidId}
 * @throws VtdException   propagated from {@code retrieveOrcidId}
 * @throws ParseException propagated from {@code retrieveOrcidId}
 */
public static String retrieveWorkIdFromSummary(byte[] bytes, String defaultValue)
	throws VtdException, ParseException {
	final String summaryXPath = "//work:work-summary";
	final String idAttribute = "put-code";
	return retrieveOrcidId(bytes, defaultValue, NS_ACTIVITIES, NS_ACTIVITIES_URL, summaryXPath, idAttribute);
}
private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath, private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
String idAttributeName) String idAttributeName)
throws VtdException, ParseException { throws VtdException, ParseException {
@ -148,6 +195,7 @@ public class XMLRecordParser {
final VTDNav vn = vg.getNav(); final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn); final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(ns, nsUrl); ap.declareXPathNameSpace(ns, nsUrl);
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
.getTextValuesWithAttributes( .getTextValuesWithAttributes(
ap, vn, xpath, Arrays.asList(idAttributeName)); ap, vn, xpath, Arrays.asList(idAttributeName));
@ -157,4 +205,144 @@ public class XMLRecordParser {
Log.info("id not found - default: " + defaultValue); Log.info("id not found - default: " + defaultValue);
return defaultValue; return defaultValue;
} }
/**
 * Collects, for every {@code work:work-summary} element of the document, the value of its {@code put-code}
 * attribute together with the text of its {@code common:last-modified-date} child.
 * <p>
 * Summaries without a {@code put-code} attribute are skipped, so that a date is never stored under the id of a
 * previously visited summary. Summaries without a last-modified date (or with an empty one) are skipped too.
 *
 * @param bytes the XML document
 * @return a map from work id (put-code) to last modified date; empty when no summary qualifies
 * @throws ParseException      when the document cannot be parsed
 * @throws XPathParseException when the XPath expression cannot be compiled
 * @throws NavException        when navigating the document fails
 * @throws XPathEvalException  when evaluating the XPath expression fails
 * @throws IOException         declared for interface compatibility
 */
public static Map<String, String> retrieveWorkIdLastModifiedDate(byte[] bytes)
	throws ParseException, XPathParseException, NavException, XPathEvalException, IOException {
	final VTDGen vg = new VTDGen();
	vg.setDoc(bytes);
	vg.parse(true);
	final VTDNav vn = vg.getNav();
	final AutoPilot ap = new AutoPilot(vn);
	ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
	ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);

	final Map<String, String> workIdLastModifiedDate = new HashMap<>();
	ap.selectXPath("//work:work-summary");
	while (ap.evalXPath() != -1) {
		final int putCodeIndex = vn.getAttrVal("put-code");
		if (putCodeIndex < 0) {
			// no id on this summary: nothing to key the date on
			continue;
		}
		final String workId = vn.toNormalizedString(putCodeIndex);
		if (vn.toElement(VTDNav.FIRST_CHILD, "common:last-modified-date")) {
			final int textIndex = vn.getText();
			if (textIndex != -1) {
				workIdLastModifiedDate.put(workId, vn.toNormalizedString(textIndex));
			}
			// move the cursor back to the summary element before the next XPath evaluation
			vn.toElement(VTDNav.PARENT);
		}
	}
	return workIdLastModifiedDate;
}
/**
 * Parses an ORCID record document into an {@link AuthorSummary}, made of the author data and the author
 * history extracted from the same document.
 *
 * @param bytes the XML document
 * @return the summary; its author data is whatever {@code retrieveAuthorData} returns for the document
 * @throws VtdException   when extracting values from the document fails
 * @throws ParseException when the document cannot be parsed
 */
public static AuthorSummary VTDParseAuthorSummary(byte[] bytes)
	throws VtdException, ParseException {
	final VTDGen generator = new VTDGen();
	generator.setDoc(bytes);
	generator.parse(true);
	final VTDNav navigator = generator.getNav();

	final AutoPilot autoPilot = new AutoPilot(navigator);
	autoPilot.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
	autoPilot.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
	autoPilot.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
	autoPilot.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
	autoPilot.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
	autoPilot.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
	autoPilot.declareXPathNameSpace(NS_HISTORY, NS_HISTORY_URL);

	// data first, history second: both read from the same AutoPilot/VTDNav pair
	final AuthorData data = retrieveAuthorData(autoPilot, navigator, bytes);
	final AuthorHistory history = retrieveAuthorHistory(autoPilot, navigator, bytes);

	final AuthorSummary summary = new AuthorSummary();
	summary.setAuthorData(data);
	summary.setAuthorHistory(history);
	return summary;
}
/**
 * Extracts the author data from an ORCID record document.
 *
 * @param ap    the AutoPilot used to evaluate the XPath expressions
 * @param vn    the navigator over the parsed document
 * @param bytes the XML document (not read by this method)
 * @return an {@link AuthorData} carrying only the error code when the document contains an
 *         {@code error:response-code}; {@code null} when no {@code record:record} element is found; otherwise
 *         the author data with oid and, when present, name, surname, credit name and other names
 * @throws VtdException when extracting values from the document fails
 */
private static AuthorData retrieveAuthorData(AutoPilot ap, VTDNav vn, byte[] bytes)
	throws VtdException {
	final AuthorData author = new AuthorData();

	// an error response short-circuits everything else
	final List<String> errorCodes = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
	if (!errorCodes.isEmpty()) {
		author.setErrorCode(errorCodes.get(0));
		return author;
	}

	final List<VtdUtilityParser.Node> records = VtdUtilityParser
		.getTextValuesWithAttributes(
			ap, vn, "//record:record", Arrays.asList("path"));
	if (records.isEmpty()) {
		return null;
	}
	// the oid is the record path without its first character
	final String recordPath = records.get(0).getAttributes().get("path");
	author.setOid(recordPath.substring(1));

	final List<String> givenNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names");
	if (!givenNames.isEmpty()) {
		author.setName(givenNames.get(0));
	}

	final List<String> familyNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name");
	if (!familyNames.isEmpty()) {
		author.setSurname(familyNames.get(0));
	}

	final List<String> credits = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name");
	if (!credits.isEmpty()) {
		author.setCreditName(credits.get(0));
	}

	final List<String> aliases = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content");
	if (!aliases.isEmpty()) {
		author.setOtherNames(aliases);
	}

	return author;
}
/**
 * Extracts the {@code history:*} values of an ORCID record document into an {@link AuthorHistory}.
 * A field is set only when the corresponding value is not blank; otherwise it keeps the value it has in a
 * newly created {@link AuthorHistory}.
 *
 * @param ap    the AutoPilot used to evaluate the XPath expressions
 * @param vn    the navigator over the parsed document
 * @param bytes the XML document (not read by this method)
 * @return the populated author history, never {@code null}
 * @throws VtdException when extracting values from the document fails
 */
private static AuthorHistory retrieveAuthorHistory(AutoPilot ap, VTDNav vn, byte[] bytes)
	throws VtdException {
	final AuthorHistory history = new AuthorHistory();

	final String method = VtdUtilityParser.getSingleValue(ap, vn, "//history:creation-method");
	if (StringUtils.isNoneBlank(method)) {
		history.setCreationMethod(method);
	}

	final String completed = VtdUtilityParser.getSingleValue(ap, vn, "//history:completion-date");
	if (StringUtils.isNoneBlank(completed)) {
		history.setCompletionDate(completed);
	}

	final String submitted = VtdUtilityParser.getSingleValue(ap, vn, "//history:submission-date");
	if (StringUtils.isNoneBlank(submitted)) {
		history.setSubmissionDate(submitted);
	}

	final String claimedFlag = VtdUtilityParser.getSingleValue(ap, vn, "//history:claimed");
	if (StringUtils.isNoneBlank(claimedFlag)) {
		history.setClaimed(Boolean.parseBoolean(claimedFlag));
	}

	final String emailFlag = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-email");
	if (StringUtils.isNoneBlank(emailFlag)) {
		history.setVerifiedEmail(Boolean.parseBoolean(emailFlag));
	}

	final String primaryEmailFlag = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-primary-email");
	if (StringUtils.isNoneBlank(primaryEmailFlag)) {
		history.setVerifiedPrimaryEmail(Boolean.parseBoolean(primaryEmailFlag));
	}

	final String deactivated = VtdUtilityParser.getSingleValue(ap, vn, "//history:deactivation-date");
	if (StringUtils.isNoneBlank(deactivated)) {
		history.setDeactivationDate(deactivated);
	}

	final String modified = VtdUtilityParser
		.getSingleValue(ap, vn, "//history:history/common:last-modified-date");
	if (StringUtils.isNoneBlank(modified)) {
		history.setLastModifiedDate(modified);
	}

	return history;
}
} }

View File

@ -19,8 +19,8 @@ import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log; import org.mortbay.log.Log;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
/** /**
@ -87,29 +87,29 @@ public class ActivitiesDumpReader {
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
buffer.append(line); buffer.append(line);
} }
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi WorkDetail workDetail = XMLRecordParserNoDoi
.VTDParseWorkData(buffer.toString().getBytes()); .VTDParseWorkData(buffer.toString().getBytes());
if (workDataNoDoi != null) { if (workDetail != null) {
if (workDataNoDoi.getErrorCode() != null) { if (workDetail.getErrorCode() != null) {
errorFromOrcidFound += 1; errorFromOrcidFound += 1;
Log Log
.debug( .debug(
"error from Orcid with code " "error from Orcid with code "
+ workDataNoDoi.getErrorCode() + workDetail.getErrorCode()
+ " for entry " + " for entry "
+ entry.getName()); + entry.getName());
continue; continue;
} }
boolean isDoiFound = workDataNoDoi boolean isDoiFound = workDetail
.getExtIds() .getExtIds()
.stream() .stream()
.filter(e -> e.getType() != null) .filter(e -> e.getType() != null)
.anyMatch(e -> e.getType().equals("doi")); .anyMatch(e -> e.getType().equals("doi"));
if (!isDoiFound) { if (!isDoiFound) {
String jsonData = JsonHelper.createOidWork(workDataNoDoi); String jsonData = JsonHelper.createOidWork(workDetail);
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData); Log.debug("oid: " + workDetail.getOid() + " data: " + jsonData);
final Text key = new Text(workDataNoDoi.getOid()); final Text key = new Text(workDetail.getOid());
final Text value = new Text(jsonData); final Text value = new Text(jsonData);
try { try {

View File

@ -4,10 +4,12 @@ package eu.dnetlib.doiboost.orcidnodoi;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException; import java.io.IOException;
import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -18,6 +20,7 @@ import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.mortbay.log.Log;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -30,14 +33,17 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2; import scala.Tuple2;
/** /**
* This spark job generates one parquet file, containing orcid publications dataset * This spark job generates orcid publications no doi dataset
*/ */
public class SparkGenEnrichedOrcidWorks { public class SparkGenEnrichedOrcidWorks {
@ -53,47 +59,65 @@ public class SparkGenEnrichedOrcidWorks {
.toString( .toString(
SparkGenEnrichedOrcidWorks.class SparkGenEnrichedOrcidWorks.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); "/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json")));
parser.parseArgument(args); parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
final String hdfsServerUri = parser.get("hdfsServerUri");
final String workingPath = parser.get("workingPath"); final String workingPath = parser.get("workingPath");
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath"); final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
final String outputWorksPath = parser.get("outputWorksPath"); final String orcidDataFolder = parser.get("orcidDataFolder");
final String hdfsServerUri = parser.get("hdfsServerUri");
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
if (StringUtils.isBlank(lastUpdate)) {
throw new RuntimeException("last update info not found");
}
final String dateOfCollection = lastUpdate.substring(0, 10);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc Dataset<AuthorData> authorDataset = spark
.sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark
.createDataset( .createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), sc
.textFile(workingPath.concat(orcidDataFolder).concat("/authors/*"))
.map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
.filter(authorSummary -> authorSummary.getAuthorData() != null)
.map(authorSummary -> authorSummary.getAuthorData())
.rdd(),
Encoders.bean(AuthorData.class)); Encoders.bean(AuthorData.class));
logger.info("Authors data loaded: " + summariesDataset.count()); logger.info("Authors data loaded: " + authorDataset.count());
JavaPairRDD<Text, Text> activitiesRDD = sc Dataset<WorkDetail> workDataset = spark
.sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class);
Dataset<WorkDataNoDoi> activitiesDataset = spark
.createDataset( .createDataset(
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), sc
Encoders.bean(WorkDataNoDoi.class)); .textFile(workingPath.concat(orcidDataFolder).concat("/works/*"))
logger.info("Works data loaded: " + activitiesDataset.count()); .map(item -> OBJECT_MAPPER.readValue(item, Work.class))
.filter(work -> work.getWorkDetail() != null)
.map(work -> work.getWorkDetail())
.filter(work -> work.getErrorCode() == null)
.filter(
work -> work
.getExtIds()
.stream()
.filter(e -> e.getType() != null)
.noneMatch(e -> e.getType().equalsIgnoreCase("doi")))
.rdd(),
Encoders.bean(WorkDetail.class));
logger.info("Works data loaded: " + workDataset.count());
JavaRDD<Tuple2<String, String>> enrichedWorksRDD = activitiesDataset JavaRDD<Tuple2<String, String>> enrichedWorksRDD = workDataset
.joinWith( .joinWith(
summariesDataset, authorDataset,
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner") workDataset.col("oid").equalTo(authorDataset.col("oid")), "inner")
.map( .map(
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, String>>) value -> { (MapFunction<Tuple2<WorkDetail, AuthorData>, Tuple2<String, String>>) value -> {
WorkDataNoDoi w = value._1; WorkDetail w = value._1;
AuthorData a = value._2; AuthorData a = value._2;
AuthorMatcher.match(a, w.getContributors()); AuthorMatcher.match(a, w.getContributors());
return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w)); return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
@ -113,13 +137,25 @@ public class SparkGenEnrichedOrcidWorks {
.sparkContext() .sparkContext()
.longAccumulator("errorsNotFoundAuthors"); .longAccumulator("errorsNotFoundAuthors");
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType"); final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
final LongAccumulator deactivatedAcc = spark.sparkContext().longAccumulator("deactivated_found");
final LongAccumulator titleNotProvidedAcc = spark
.sparkContext()
.longAccumulator("Title_not_provided_found");
final LongAccumulator noUrlAcc = spark.sparkContext().longAccumulator("no_url_found");
final PublicationToOaf publicationToOaf = new PublicationToOaf( final PublicationToOaf publicationToOaf = new PublicationToOaf(
parsedPublications, parsedPublications,
enrichedPublications, enrichedPublications,
errorsGeneric, errorsGeneric,
errorsInvalidTitle, errorsInvalidTitle,
errorsNotFoundAuthors, errorsNotFoundAuthors,
errorsInvalidType); errorsInvalidType,
otherTypeFound,
deactivatedAcc,
titleNotProvidedAcc,
noUrlAcc,
dateOfCollection);
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
.map( .map(
e -> { e -> {
@ -148,33 +184,10 @@ public class SparkGenEnrichedOrcidWorks {
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString()); logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString()); logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
logger.info("otherTypeFound: " + otherTypeFound.value().toString());
logger.info("deactivatedAcc: " + deactivatedAcc.value().toString());
logger.info("titleNotProvidedAcc: " + titleNotProvidedAcc.value().toString());
logger.info("noUrlAcc: " + noUrlAcc.value().toString());
}); });
} }
/**
 * Builds an {@link AuthorData} bean from one (ORCID id, JSON document) pair.
 * The id becomes the bean's oid; name, surname and credit name are read from the
 * "name", "surname" and "creditname" members of the JSON document via getJsonValue.
 *
 * @param orcidId the ORCID identifier; its string form is stored as the oid
 * @param json the JSON document describing the author
 * @return the populated author bean
 */
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
	final AuthorData author = new AuthorData();
	author.setOid(orcidId.toString());

	// Parse the document once; each field below is looked up on the same tree.
	final JsonElement root = new JsonParser().parse(json.toString());
	author.setName(getJsonValue(root, "name"));
	author.setSurname(getJsonValue(root, "surname"));
	author.setCreditName(getJsonValue(root, "creditname"));

	return author;
}
/**
 * Deserializes a JSON document into a {@link WorkDataNoDoi} using Gson.
 *
 * @param orcidId accepted for symmetry with the sequence-file key but not read by this method
 * @param json the JSON document to deserialize
 * @return the bean produced by Gson for the given document
 */
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
	return new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
}
/**
 * Reads a top-level member of a JSON object as a string.
 *
 * @param jElement parsed JSON; {@code getAsJsonObject()} is called on it, so it must be a JSON object
 * @param property name of the member to read
 * @return the member's string value, or the empty string when the member is absent or JSON null
 */
private static String getJsonValue(JsonElement jElement, String property) {
	// get() yields null for a missing member, so a separate has() probe is not needed.
	final JsonElement value = jElement.getAsJsonObject().get(property);
	if (value == null || value.isJsonNull()) {
		// A plain literal is enough; allocating a fresh String for "" serves no purpose.
		return "";
	}
	return value.getAsString();
}
} }

View File

@ -22,6 +22,10 @@ public class JsonWriter {
return OBJECT_MAPPER.writeValueAsString(authorData); return OBJECT_MAPPER.writeValueAsString(authorData);
} }
/**
 * Serializes an arbitrary object to its JSON string form through the shared {@code OBJECT_MAPPER}.
 *
 * @param obj the object to serialize
 * @return the string returned by {@code OBJECT_MAPPER.writeValueAsString(obj)}
 * @throws JsonProcessingException propagated unchanged from {@code writeValueAsString}
 */
public static String create(Object obj) throws JsonProcessingException {
return OBJECT_MAPPER.writeValueAsString(obj);
}
public static String create(WorkData workData) { public static String create(WorkData workData) {
JsonObject work = new JsonObject(); JsonObject work = new JsonObject();
work.addProperty("oid", workData.getOid()); work.addProperty("oid", workData.getOid());

View File

@ -18,7 +18,6 @@ import com.google.gson.*;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility; import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
import eu.dnetlib.doiboost.orcidnodoi.util.Pair; import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
@ -26,21 +25,28 @@ import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
/** /**
* This class converts an orcid publication from json format to oaf * This class converts an orcid publication from json format to oaf
*/ */
public class PublicationToOaf implements Serializable { public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class); static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
public static final String ORCID = StringUtils.upperCase(ModelConstants.ORCID);
public final static String orcidPREFIX = "orcid_______"; public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____"; public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::"; public static final String SEPARATOR = "::";
public static final String DEACTIVATED_NAME = "Given Names Deactivated";
public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
private String dateOfCollection = "";
private final LongAccumulator parsedPublications; private final LongAccumulator parsedPublications;
private final LongAccumulator enrichedPublications; private final LongAccumulator enrichedPublications;
private final LongAccumulator errorsGeneric; private final LongAccumulator errorsGeneric;
private final LongAccumulator errorsInvalidTitle; private final LongAccumulator errorsInvalidTitle;
private final LongAccumulator errorsNotFoundAuthors; private final LongAccumulator errorsNotFoundAuthors;
private final LongAccumulator errorsInvalidType; private final LongAccumulator errorsInvalidType;
private final LongAccumulator otherTypeFound;
private final LongAccumulator deactivatedAcc;
private final LongAccumulator titleNotProvidedAcc;
private final LongAccumulator noUrlAcc;
public PublicationToOaf( public PublicationToOaf(
LongAccumulator parsedPublications, LongAccumulator parsedPublications,
@ -48,13 +54,23 @@ public class PublicationToOaf implements Serializable {
LongAccumulator errorsGeneric, LongAccumulator errorsGeneric,
LongAccumulator errorsInvalidTitle, LongAccumulator errorsInvalidTitle,
LongAccumulator errorsNotFoundAuthors, LongAccumulator errorsNotFoundAuthors,
LongAccumulator errorsInvalidType) { LongAccumulator errorsInvalidType,
LongAccumulator otherTypeFound,
LongAccumulator deactivatedAcc,
LongAccumulator titleNotProvidedAcc,
LongAccumulator noUrlAcc,
String dateOfCollection) {
this.parsedPublications = parsedPublications; this.parsedPublications = parsedPublications;
this.enrichedPublications = enrichedPublications; this.enrichedPublications = enrichedPublications;
this.errorsGeneric = errorsGeneric; this.errorsGeneric = errorsGeneric;
this.errorsInvalidTitle = errorsInvalidTitle; this.errorsInvalidTitle = errorsInvalidTitle;
this.errorsNotFoundAuthors = errorsNotFoundAuthors; this.errorsNotFoundAuthors = errorsNotFoundAuthors;
this.errorsInvalidType = errorsInvalidType; this.errorsInvalidType = errorsInvalidType;
this.otherTypeFound = otherTypeFound;
this.deactivatedAcc = deactivatedAcc;
this.titleNotProvidedAcc = titleNotProvidedAcc;
this.noUrlAcc = noUrlAcc;
this.dateOfCollection = dateOfCollection;
} }
public PublicationToOaf() { public PublicationToOaf() {
@ -64,12 +80,19 @@ public class PublicationToOaf implements Serializable {
this.errorsInvalidTitle = null; this.errorsInvalidTitle = null;
this.errorsNotFoundAuthors = null; this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null; this.errorsInvalidType = null;
this.otherTypeFound = null;
this.deactivatedAcc = null;
this.titleNotProvidedAcc = null;
this.noUrlAcc = null;
this.dateOfCollection = null;
} }
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() { private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{ {
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID)); put(
ModelConstants.ORCID,
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
} }
}; };
@ -79,10 +102,10 @@ public class PublicationToOaf implements Serializable {
{ {
put("ark".toLowerCase(), new Pair<>("ark", "ark")); put("ark".toLowerCase(), new Pair<>("ark", "ark"));
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv")); put("arxiv".toLowerCase(), new Pair<>("arXiv", "arXiv"));
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc")); put("pmc".toLowerCase(), new Pair<>("pmc", "PubMed Central ID"));
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid")); put("pmid".toLowerCase(), new Pair<>("pmid", "PubMed ID"));
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid")); put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcid workid"));
put("urn".toLowerCase(), new Pair<>("urn", "urn")); put("urn".toLowerCase(), new Pair<>("urn", "urn"));
} }
}; };
@ -102,21 +125,15 @@ public class PublicationToOaf implements Serializable {
} }
} }
public static final String PID_TYPES = "dnet:pid_types";
public Oaf generatePublicationActionsFromJson(final String json) { public Oaf generatePublicationActionsFromJson(final String json) {
try { if (parsedPublications != null) {
if (parsedPublications != null) { parsedPublications.add(1);
parsedPublications.add(1);
}
JsonElement jElement = new JsonParser().parse(json);
JsonObject jObject = jElement.getAsJsonObject();
return generatePublicationActionsFromDump(jObject);
} catch (Throwable t) {
logger.error("creating publication: " + t.getMessage());
if (errorsGeneric != null) {
errorsGeneric.add(1);
}
return null;
} }
JsonElement jElement = new JsonParser().parse(json);
JsonObject jObject = jElement.getAsJsonObject();
return generatePublicationActionsFromDump(jObject);
} }
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) { public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
@ -142,7 +159,7 @@ public class PublicationToOaf implements Serializable {
publication.setLastupdatetimestamp(new Date().getTime()); publication.setLastupdatetimestamp(new Date().getTime());
publication.setDateofcollection("2020-10-14"); publication.setDateofcollection(dateOfCollection);
publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
// Adding external ids // Adding external ids
@ -150,8 +167,8 @@ public class PublicationToOaf implements Serializable {
.keySet() .keySet()
.stream() .stream()
.forEach(jsonExtId -> { .forEach(jsonExtId -> {
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue(); final String classid = externalIds.get(jsonExtId.toLowerCase()).getKey();
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey(); final String classname = externalIds.get(jsonExtId.toLowerCase()).getValue();
final String extId = getStringValue(rootElement, jsonExtId); final String extId = getStringValue(rootElement, jsonExtId);
if (StringUtils.isNotBlank(extId)) { if (StringUtils.isNotBlank(extId)) {
publication publication
@ -182,11 +199,19 @@ public class PublicationToOaf implements Serializable {
} }
return null; return null;
} }
if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
if (titleNotProvidedAcc != null) {
titleNotProvidedAcc.add(1);
}
return null;
}
publication publication
.setTitle( .setTitle(
titles titles
.stream() .stream()
.map(t -> mapStructuredProperty(t, ModelConstants.MAIN_TITLE_QUALIFIER, null)) .map(t -> {
return mapStructuredProperty(t, ModelConstants.MAIN_TITLE_QUALIFIER, null);
})
.filter(s -> s != null) .filter(s -> s != null)
.collect(Collectors.toList())); .collect(Collectors.toList()));
// Adding identifier // Adding identifier
@ -216,8 +241,23 @@ public class PublicationToOaf implements Serializable {
mapQualifier( mapQualifier(
type, type, ModelConstants.DNET_DATA_CITE_RESOURCE, ModelConstants.DNET_DATA_CITE_RESOURCE)); type, type, ModelConstants.DNET_DATA_CITE_RESOURCE, ModelConstants.DNET_DATA_CITE_RESOURCE));
// Look up the typology mapping for this work type; without one the record cannot be converted.
Map<String, String> publicationType = typologiesMapping.get(type);
if (publicationType == null || publicationType.isEmpty()) {
	// Skip the record whether or not an accumulator is attached. The statements that follow
	// dereference typologiesMapping.get(type) and would throw a NullPointerException when this
	// converter was created without accumulators (errorsInvalidType == null).
	if (errorsInvalidType != null) {
		errorsInvalidType.add(1);
	}
	logger.error("publication_type_not_found: " + type);
	return null;
}
final String typeValue = typologiesMapping.get(type).get("value"); final String typeValue = typologiesMapping.get(type).get("value");
cobjValue = typologiesMapping.get(type).get("cobj"); cobjValue = typologiesMapping.get(type).get("cobj");
// this dataset must contain only publication
if (cobjValue.equals("0020")) {
if (otherTypeFound != null) {
otherTypeFound.add(1);
}
return null;
}
final Instance instance = new Instance(); final Instance instance = new Instance();
// Adding hostedby // Adding hostedby
@ -228,9 +268,14 @@ public class PublicationToOaf implements Serializable {
if (urls != null && !urls.isEmpty()) { if (urls != null && !urls.isEmpty()) {
instance.setUrl(urls); instance.setUrl(urls);
} else { } else {
dataInfo.setInvisible(true); if (noUrlAcc != null) {
noUrlAcc.add(1);
}
return null;
} }
dataInfo.setInvisible(true);
final String pubDate = getPublicationDate(rootElement, "publicationDates"); final String pubDate = getPublicationDate(rootElement, "publicationDates");
if (StringUtils.isNotBlank(pubDate)) { if (StringUtils.isNotBlank(pubDate)) {
instance.setDateofacceptance(mapStringField(pubDate, null)); instance.setDateofacceptance(mapStringField(pubDate, null));
@ -241,11 +286,9 @@ public class PublicationToOaf implements Serializable {
// Adding accessright // Adding accessright
instance instance
.setAccessright( .setAccessright(
OafUtils OafMapperUtils
.createAccessRight( .accessRight(
ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_ACCESS_MODES,
ModelConstants.UNKNOWN,
ModelConstants.DNET_ACCESS_MODES,
ModelConstants.DNET_ACCESS_MODES)); ModelConstants.DNET_ACCESS_MODES));
// Adding type // Adding type
@ -266,12 +309,28 @@ public class PublicationToOaf implements Serializable {
// Adding authors // Adding authors
final List<Author> authors = createAuthors(rootElement); final List<Author> authors = createAuthors(rootElement);
if (authors != null && authors.size() > 0) { if (authors != null && authors.size() > 0) {
publication.setAuthor(authors); if (authors.stream().filter(a -> {
} else { return ((Objects.nonNull(a.getName()) && a.getName().equals(DEACTIVATED_NAME)) ||
if (errorsNotFoundAuthors != null) { (Objects.nonNull(a.getSurname()) && a.getSurname().equals(DEACTIVATED_SURNAME)));
errorsNotFoundAuthors.add(1); }).count() > 0) {
if (deactivatedAcc != null) {
deactivatedAcc.add(1);
}
return null;
} else {
publication.setAuthor(authors);
}
} else {
if (authors == null) {
Gson gson = new GsonBuilder().setPrettyPrinting().create();
String json = gson.toJson(rootElement);
throw new RuntimeException("not_valid_authors: " + json);
} else {
if (errorsNotFoundAuthors != null) {
errorsNotFoundAuthors.add(1);
}
return null;
} }
return null;
} }
String classValue = getDefaultResulttype(cobjValue); String classValue = getDefaultResulttype(cobjValue);
publication publication
@ -518,36 +577,33 @@ public class PublicationToOaf implements Serializable {
private KeyValue createCollectedFrom() { private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue(); KeyValue cf = new KeyValue();
cf.setValue(ORCID); cf.setValue(ModelConstants.ORCID.toUpperCase());
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf; return cf;
} }
private KeyValue createHostedBy() { private KeyValue createHostedBy() {
KeyValue hb = new KeyValue(); return ModelConstants.UNKNOWN_REPOSITORY;
hb.setValue("Unknown Repository");
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
return hb;
} }
private StructuredProperty mapAuthorId(String orcidId) { private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty(); final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId); sp.setValue(orcidId);
final Qualifier q = new Qualifier(); final Qualifier q = new Qualifier();
q.setClassid(ORCID.toLowerCase()); q.setClassid(ModelConstants.ORCID);
q.setClassname(ORCID.toLowerCase()); q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES); q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES); q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q); sp.setQualifier(q);
final DataInfo dataInfo = new DataInfo(); final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(false); dataInfo.setDeletedbyinference(false);
dataInfo.setInferred(false); dataInfo.setInferred(false);
dataInfo.setTrust("0.9"); dataInfo.setTrust("0.91");
dataInfo dataInfo
.setProvenanceaction( .setProvenanceaction(
mapQualifier( mapQualifier(
"sysimport:crosswalk:entityregistry", ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
"Harvested", ModelConstants.HARVESTED,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS)); ModelConstants.DNET_PROVENANCE_ACTIONS));
sp.setDataInfo(dataInfo); sp.setDataInfo(dataInfo);

View File

@ -0,0 +1,77 @@
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java (rejected hunks)
@@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
- public static final String ORCID = "ORCID";
- public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::";
+ public static final String DEACTIVATED_NAME = "Given Names Deactivated";
+ public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
private String dateOfCollection = "";
private final LongAccumulator parsedPublications;
@@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null;
this.otherTypeFound = null;
+ this.deactivatedAcc = null;
+ this.titleNotProvidedAcc = null;
+ this.noUrlAcc = null;
this.dateOfCollection = null;
}
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{
- put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
+ put(
+ ModelConstants.ORCID,
+ new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
}
};
@@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
}
return null;
}
+ if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
+ if (titleNotProvidedAcc != null) {
+ titleNotProvidedAcc.add(1);
+ }
+ return null;
+ }
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
publication
.setTitle(
@@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
- cf.setValue(ORCID);
+ cf.setValue(ModelConstants.ORCID.toUpperCase());
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf;
}
private KeyValue createHostedBy() {
- KeyValue hb = new KeyValue();
- hb.setValue("Unknown Repository");
- hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
- return hb;
+ return ModelConstants.UNKNOWN_REPOSITORY;
}
private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId);
final Qualifier q = new Qualifier();
- q.setClassid(ORCID.toLowerCase());
- q.setClassname(ORCID_PID_TYPE_CLASSNAME);
+ q.setClassid(ModelConstants.ORCID);
+ q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q);

View File

@ -19,8 +19,8 @@ import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.dhp.schema.orcid.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.dhp.schema.orcid.WorkDetail;
/** /**
* This class is used for searching from a list of publication contributors a * This class is used for searching from a list of publication contributors a
@ -209,7 +209,7 @@ public class AuthorMatcher {
} }
} }
private static String toJson(WorkDataNoDoi work) { private static String toJson(WorkDetail work) {
GsonBuilder builder = new GsonBuilder(); GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create(); Gson gson = builder.create();
return gson.toJson(work); return gson.toJson(work);

View File

@ -12,10 +12,10 @@ import com.ximpleware.*;
import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.dhp.schema.orcid.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId; import eu.dnetlib.dhp.schema.orcid.ExternalId;
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; import eu.dnetlib.dhp.schema.orcid.PublicationDate;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.dhp.schema.orcid.WorkDetail;
/** /**
* This class is used for parsing xml data with vtd parser * This class is used for parsing xml data with vtd parser
@ -42,7 +42,7 @@ public class XMLRecordParserNoDoi {
private static final String NS_ERROR = "error"; private static final String NS_ERROR = "error";
public static WorkDataNoDoi VTDParseWorkData(byte[] bytes) public static WorkDetail VTDParseWorkData(byte[] bytes)
throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException, throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
NavException, XPathEvalException { NavException, XPathEvalException {
final VTDGen vg = new VTDGen(); final VTDGen vg = new VTDGen();
@ -54,7 +54,7 @@ public class XMLRecordParserNoDoi {
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
WorkDataNoDoi workData = new WorkDataNoDoi(); WorkDetail workData = new WorkDetail();
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
if (!errors.isEmpty()) { if (!errors.isEmpty()) {
workData.setErrorCode(errors.get(0)); workData.setErrorCode(errors.get(0));

View File

@ -1,3 +1,5 @@
[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true}, [{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true},
{"paramName":"a", "paramLongName":"authorsPath", "paramDescription": "the path of the authors seq file", "paramRequired": true},
{"paramName":"xw", "paramLongName":"xmlWorksPath", "paramDescription": "the path of the works xml seq file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputDoiAuthorListPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true} {"paramName":"o", "paramLongName":"outputDoiAuthorListPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
] ]

View File

@ -1,7 +1,6 @@
[ [
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, {"paramName":"i", "paramLongName":"orcidDataFolder", "paramDescription": "the folder of orcid data", "paramRequired": true},
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true} {"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
] ]

View File

@ -1,18 +0,0 @@
<!--
Default property values for a workflow application (the file name is not visible here;
presumably an Oozie config-default file, given the oozie.* property below).
NOTE(review): jobTracker and nameNode are hard-coded to specific d4science cluster hosts.
-->
<configuration>
<!-- Endpoint (host:port) supplied for the jobTracker property. -->
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<!-- HDFS URI supplied for the nameNode property. -->
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<!-- Queue name; set to "default". -->
<property>
<name>queueName</name>
<value>default</value>
</property>
<!-- Share-lib name used for spark actions; set to "spark2". -->
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -1,55 +1,99 @@
<workflow-app name="Gen_Doi_Author_List_WF" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="gen_doi_author_list" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>spark2MaxExecutors</name>
<value>20</value>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property> <property>
<name>workingPath</name> <name>oozie.action.sharelib.for.spark</name>
<description>the working dir base path</description> <value>${oozieActionShareLibForSpark2}</value>
</property> </property>
<property> </configuration>
<name>sparkDriverMemory</name> </global>
<description>memory for driver process</description>
</property> <start to="ResetWorkingPath"/>
<property>
<name>sparkExecutorMemory</name> <kill name="Kill">
<description>memory for individual executor</description> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</property> </kill>
<property>
<name>sparkExecutorCores</name> <action name="ResetWorkingPath">
<description>number of cores used by single executor</description> <fs>
</property> <delete path='${workingPath}/doi_author_list'/>
</parameters> </fs>
<ok to="GenDoiAuthorList"/>
<start to="ResetWorkingPath"/> <error to="Kill"/>
</action>
<kill name="Kill"> <action name="GenDoiAuthorList">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <spark xmlns="uri:oozie:spark-action:0.2">
</kill> <master>yarn-cluster</master>
<mode>cluster</mode>
<action name="ResetWorkingPath"> <name>GenDoiAuthorList</name>
<fs> <class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
<delete path='${workingPath_activities}/doi_author_list'/> <jar>dhp-doiboost-${projectVersion}.jar</jar>
</fs> <spark-opts>
<ok to="Gen_Doi_Author_List"/> --executor-memory=${sparkExecutorMemory}
<error to="Kill"/> --driver-memory=${sparkDriverMemory}
</action> --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
<action name="Gen_Doi_Author_List"> --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
<spark xmlns="uri:oozie:spark-action:0.2"> --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
<job-tracker>${jobTracker}</job-tracker> --conf spark.dynamicAllocation.enabled=true
<name-node>${nameNode}</name-node> --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
<master>yarn</master> </spark-opts>
<mode>cluster</mode> <arg>-w</arg><arg>${workingPath}/</arg>
<name>Gen_Doi_Author_List</name> <arg>-a</arg><arg>authors/authors.seq</arg>
<class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class> <arg>-xw</arg><arg>xml/works/*.seq</arg>
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar> <arg>-o</arg><arg>doi_author_list/</arg>
<spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} </spark>
</spark-opts> <ok to="End"/>
<arg>-w</arg><arg>${workingPath}/</arg> <error to="Kill"/>
<arg>-o</arg><arg>doi_author_list/</arg> </action>
</spark>
<ok to="End"/> <end name="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app> </workflow-app>

View File

@ -0,0 +1,163 @@
<workflow-app name="update_orcid_datasets" xmlns="uri:oozie:workflow:0.5">
<!--
Oozie workflow "update_orcid_datasets".
Declares two Spark actions (SparkUpdateOrcidAuthors, SparkUpdateOrcidWorks), two distcp actions that copy
${workingPath}/orcid_dataset/new_authors and new_works over orcid_dataset/authors and works, and fs actions
that delete the new_authors / new_works directories.
-->
<!-- Workflow parameters. Properties carrying a <value> have a default; the others declare only a description. -->
<parameters>
<property>
<name>spark2MaxExecutors</name>
<value>50</value>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<!-- Settings shared by the actions: job tracker, name node and the sharelib used for spark actions. -->
<!-- NOTE(review): sparkExecutorCores is declared above but is not referenced by any action in this file. -->
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<!--
NOTE(review): the entry point is promoteOrcidAuthorsDataset. No transition in this file targets
ResetWorkingPath, so the chain ResetWorkingPath, UpdateOrcidAuthors, UpdateOrcidWorks is not reached
from the start node. Confirm that running only the promote and clean steps is intended.
-->
<start to="promoteOrcidAuthorsDataset"/>
<!-- Terminal failure node: every action below routes its error transition here. -->
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- Deletes orcid_dataset/new_authors and orcid_dataset/new_works, then continues with UpdateOrcidAuthors. -->
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/orcid_dataset/new_authors'/>
<delete path='${workingPath}/orcid_dataset/new_works'/>
</fs>
<ok to="UpdateOrcidAuthors"/>
<error to="Kill"/>
</action>
<!--
Runs eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors with dynamic allocation capped at ${spark2MaxExecutors}.
The -f, -o and -t arguments are passed the literal "-"; presumably placeholders for options this job does not
use (verify against the job's argument parser).
-->
<action name="UpdateOrcidAuthors">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>UpdateOrcidAuthors</name>
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg>
<arg>-o</arg><arg>-</arg>
<arg>-t</arg><arg>-</arg>
</spark>
<ok to="UpdateOrcidWorks"/>
<error to="Kill"/>
</action>
<!--
Runs eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks with the same Spark options and arguments as
UpdateOrcidAuthors. On success it goes straight to End, not to the promote actions.
-->
<action name="UpdateOrcidWorks">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>UpdateOrcidWorks</name>
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg>
<arg>-o</arg><arg>-</arg>
<arg>-t</arg><arg>-</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<!--
Replaces orcid_dataset/authors with the content of orcid_dataset/new_authors: the prepare step deletes and
recreates the target directory, then distcp copies new_authors/* into it.
-->
<action name="promoteOrcidAuthorsDataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${workingPath}/orcid_dataset/authors"/>
<mkdir path="${workingPath}/orcid_dataset/authors"/>
</prepare>
<arg>${workingPath}/orcid_dataset/new_authors/*</arg>
<arg>${workingPath}/orcid_dataset/authors</arg>
</distcp>
<ok to="promoteOrcidWorksDataset"/>
<error to="Kill"/>
</action>
<!-- Same as promoteOrcidAuthorsDataset, for orcid_dataset/new_works into orcid_dataset/works. -->
<action name="promoteOrcidWorksDataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${workingPath}/orcid_dataset/works"/>
<mkdir path="${workingPath}/orcid_dataset/works"/>
</prepare>
<arg>${workingPath}/orcid_dataset/new_works/*</arg>
<arg>${workingPath}/orcid_dataset/works</arg>
</distcp>
<ok to="CleanWorkingPath"/>
<error to="Kill"/>
</action>
<!-- Removes the new_authors / new_works directories once both promote actions have succeeded. -->
<action name="CleanWorkingPath">
<fs>
<delete path='${workingPath}/orcid_dataset/new_authors'/>
<delete path='${workingPath}/orcid_dataset/new_works'/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,22 +0,0 @@
<configuration>
<!--
Job-level configuration: job tracker and name node addresses, the sharelib used for java actions (spark2),
and launcher settings (user classpath first, 4g max heap for the launcher map JVM).
-->
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value>
</property>
</configuration>

View File

@ -1,9 +1,25 @@
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property>
<name>spark2UpdateStepMaxExecutors</name>
<value>50</value>
</property>
<property> <property>
<name>workingPath</name> <name>workingPath</name>
<description>the working dir base path</description> <description>the working dir base path</description>
</property> </property>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value>
</property>
<property> <property>
<name>token</name> <name>token</name>
<description>access token</description> <description>access token</description>
@ -30,7 +46,7 @@
<description>number of cores used by single executor</description> <description>number of cores used by single executor</description>
</property> </property>
<property> <property>
<name>spark2MaxExecutors</name> <name>spark2DownloadingMaxExecutors</name>
<value>10</value> <value>10</value>
</property> </property>
<property> <property>
@ -58,6 +74,8 @@
</parameters> </parameters>
<global> <global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration> <configuration>
<property> <property>
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
@ -66,18 +84,16 @@
</configuration> </configuration>
</global> </global>
<start to="DownloadOrcidAuthors"/> <start to="ResetLambda"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="ResetWorkingPath"> <action name="ResetLambda">
<fs> <fs>
<delete path='${workingPath}/downloads'/>
<delete path='${workingPath}/last_modified.csv.tar'/> <delete path='${workingPath}/last_modified.csv.tar'/>
<mkdir path='${workingPath}/downloads'/> <delete path='${workingPath}/last_modified.seq'/>
</fs> </fs>
<ok to="DownloadLambdaFile"/> <ok to="DownloadLambdaFile"/>
<error to="Kill"/> <error to="Kill"/>
@ -92,22 +108,7 @@
<argument>${shell_cmd}</argument> <argument>${shell_cmd}</argument>
<capture-output/> <capture-output/>
</shell> </shell>
<ok to="DownloadUpdatedXMLAuthors"/> <ok to="GenLastModifiedSeq"/>
<error to="Kill"/>
</action>
<action name="DownloadUpdatedXMLAuthors">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.csv.tar</arg>
<arg>-o</arg><arg>downloads/</arg>
<arg>-t</arg><arg>${token}</arg>
</java>
<ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -133,7 +134,16 @@
<arg>-o</arg><arg>last_modified.seq</arg> <arg>-o</arg><arg>last_modified.seq</arg>
<arg>-t</arg><arg>-</arg> <arg>-t</arg><arg>-</arg>
</spark> </spark>
<ok to="End"/> <ok to="ResetDownloads"/>
<error to="Kill"/>
</action>
<action name="ResetDownloads">
<fs>
<delete path='${workingPath}/downloads/updated_authors'/>
<delete path='${workingPath}/downloads/updated_works'/>
</fs>
<ok to="DownloadOrcidAuthors"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -146,7 +156,7 @@
<jar>dhp-doiboost-${projectVersion}.jar</jar> <jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2DownloadingMaxExecutors}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
@ -160,9 +170,151 @@
<arg>-o</arg><arg>downloads/updated_authors</arg> <arg>-o</arg><arg>downloads/updated_authors</arg>
<arg>-t</arg><arg>${token}</arg> <arg>-t</arg><arg>${token}</arg>
</spark> </spark>
<ok to="DownloadOrcidWorks"/>
<error to="Kill"/>
</action>
<!--
Runs eu.dnetlib.doiboost.orcid.SparkDownloadOrcidWorks with dynamic allocation capped at
${spark2DownloadingMaxExecutors}. Receives the working path, the name node, the output location
downloads/updated_works and the access token; -f is passed the literal "-".
On success continues with UpdateOrcidAuthors.
-->
<action name="DownloadOrcidWorks">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>DownloadOrcidWorks</name>
<class>eu.dnetlib.doiboost.orcid.SparkDownloadOrcidWorks</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2DownloadingMaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg>
<arg>-o</arg><arg>downloads/updated_works</arg>
<arg>-t</arg><arg>${token}</arg>
</spark>
<ok to="UpdateOrcidAuthors"/>
<error to="Kill"/>
</action>
<!--
Runs eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors with dynamic allocation capped at
${spark2UpdateStepMaxExecutors}. The -f, -o and -t arguments are passed the literal "-"; presumably
placeholders for options this job does not use (verify against the job's argument parser).
On success continues with UpdateOrcidWorks.
-->
<action name="UpdateOrcidAuthors">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>UpdateOrcidAuthors</name>
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg>
<arg>-o</arg><arg>-</arg>
<arg>-t</arg><arg>-</arg>
</spark>
<ok to="UpdateOrcidWorks"/>
<error to="Kill"/>
</action>
<!--
Runs eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks with dynamic allocation capped at
${spark2UpdateStepMaxExecutors}; same argument list as the UpdateOrcidAuthors step.
On success continues with promoteOrcidAuthorsDataset.
-->
<action name="UpdateOrcidWorks">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>UpdateOrcidWorks</name>
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg>
<arg>-o</arg><arg>-</arg>
<arg>-t</arg><arg>-</arg>
</spark>
<ok to="promoteOrcidAuthorsDataset"/>
<error to="Kill"/>
</action>
<!--
Replaces orcid_dataset/authors with the content of orcid_dataset/new_authors: the prepare step deletes and
recreates the target directory, then distcp copies new_authors/* into it.
-->
<action name="promoteOrcidAuthorsDataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${workingPath}/orcid_dataset/authors"/>
<mkdir path="${workingPath}/orcid_dataset/authors"/>
</prepare>
<arg>${workingPath}/orcid_dataset/new_authors/*</arg>
<arg>${workingPath}/orcid_dataset/authors</arg>
</distcp>
<ok to="promoteOrcidWorksDataset"/>
<error to="Kill"/>
</action>
<!--
Replaces orcid_dataset/works with the content of orcid_dataset/new_works: the prepare step deletes and
recreates the target directory, then distcp copies new_works/* into it.
-->
<action name="promoteOrcidWorksDataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${workingPath}/orcid_dataset/works"/>
<mkdir path="${workingPath}/orcid_dataset/works"/>
</prepare>
<arg>${workingPath}/orcid_dataset/new_works/*</arg>
<arg>${workingPath}/orcid_dataset/works</arg>
</distcp>
<ok to="CleanWorkingPath"/>
<error to="Kill"/>
</action>
<!-- Deletes orcid_dataset/new_authors and orcid_dataset/new_works, then continues with updateLastOrcidAuthorsDataset. -->
<action name="CleanWorkingPath">
<fs>
<delete path='${workingPath}/orcid_dataset/new_authors'/>
<delete path='${workingPath}/orcid_dataset/new_works'/>
</fs>
<ok to="updateLastOrcidAuthorsDataset"/>
<error to="Kill"/>
</action>
<!--
Copies orcid_dataset/authors/* into last_orcid_dataset/authors; the prepare step deletes and recreates the
target directory first. On success continues with updateLastOrcidWorksDataset.
-->
<action name="updateLastOrcidAuthorsDataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${workingPath}/last_orcid_dataset/authors"/>
<mkdir path="${workingPath}/last_orcid_dataset/authors"/>
</prepare>
<arg>${workingPath}/orcid_dataset/authors/*</arg>
<arg>${workingPath}/last_orcid_dataset/authors</arg>
</distcp>
<ok to="updateLastOrcidWorksDataset"/>
<error to="Kill"/>
</action>
<action name="updateLastOrcidWorksDataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${workingPath}/last_orcid_dataset/works"/>
<mkdir path="${workingPath}/last_orcid_dataset/works"/>
</prepare>
<arg>${workingPath}/orcid_dataset/works/*</arg>
<arg>${workingPath}/last_orcid_dataset/works</arg>
</distcp>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,19 +1,9 @@
{ {
"reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"},
"report": {"cobj":"0017", "value": "Report"}, "report": {"cobj":"0017", "value": "Report"},
"dataset": {"cobj":"0021", "value": "Dataset"},
"journal-article": {"cobj":"0001", "value": "Article"}, "journal-article": {"cobj":"0001", "value": "Article"},
"reference-book": {"cobj":"0002", "value": "Book"},
"other": {"cobj":"0020", "value": "Other ORP type"}, "other": {"cobj":"0020", "value": "Other ORP type"},
"proceedings-article": {"cobj":"0004", "value": "Conference object"},
"standard": {"cobj":"0038", "value": "Other literature type"},
"book-part": {"cobj":"0002", "value": "Book"},
"monograph": {"cobj":"0002", "value": "Book"},
"report-series": {"cobj":"0017", "value": "Report"},
"book": {"cobj":"0002", "value": "Book"}, "book": {"cobj":"0002", "value": "Book"},
"book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"}, "book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"},
"peer-review": {"cobj":"0015", "value": "Review"},
"book-section": {"cobj":"0013", "value": "Part of book or chapter of book"},
"book-review": {"cobj":"0015", "value": "Review"}, "book-review": {"cobj":"0015", "value": "Review"},
"conference-abstract": {"cobj":"0004", "value": "Conference object"}, "conference-abstract": {"cobj":"0004", "value": "Conference object"},
"conference-paper": {"cobj":"0004", "value": "Conference object"}, "conference-paper": {"cobj":"0004", "value": "Conference object"},
@ -21,7 +11,7 @@
"data-set": {"cobj":"0021", "value": "Dataset"}, "data-set": {"cobj":"0021", "value": "Dataset"},
"dictionary-entry": {"cobj":"0038", "value": "Other literature type"}, "dictionary-entry": {"cobj":"0038", "value": "Other literature type"},
"disclosure": {"cobj":"0038", "value": "Other literature type"}, "disclosure": {"cobj":"0038", "value": "Other literature type"},
"dissertation": {"cobj":"0006", "value": "Doctoral thesis"}, "dissertation-thesis": {"cobj":"0006", "value": "Doctoral thesis"},
"edited-book": {"cobj":"0002", "value": "Book"}, "edited-book": {"cobj":"0002", "value": "Book"},
"encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"}, "encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"},
"lecture-speech": {"cobj":"0010", "value": "Lecture"}, "lecture-speech": {"cobj":"0010", "value": "Lecture"},
@ -37,5 +27,17 @@
"supervised-student-publication": {"cobj":"0001", "value": "Article"}, "supervised-student-publication": {"cobj":"0001", "value": "Article"},
"technical-standard": {"cobj":"0038", "value": "Other literature type"}, "technical-standard": {"cobj":"0038", "value": "Other literature type"},
"website": {"cobj":"0020", "value": "Other ORP type"}, "website": {"cobj":"0020", "value": "Other ORP type"},
"working-paper": {"cobj":"0014", "value": "Research"} "working-paper": {"cobj":"0014", "value": "Research"},
"annotation": {"cobj":"0018", "value": "Annotation"},
"physical-object": {"cobj":"0028", "value": "PhysicalObject"},
"preprint": {"cobj":"0016", "value": "Preprint"},
"software": {"cobj":"0029", "value": "Software"},
"journal-issue": {"cobj":"0001", "value": "Article"},
"translation": {"cobj":"0038", "value": "Other literature type"},
"artistic-performance": {"cobj":"0020", "value": "Other ORP type"},
"online-resource": {"cobj":"0020", "value": "Other ORP type"},
"registered-copyright": {"cobj":"0020", "value": "Other ORP type"},
"trademark": {"cobj":"0020", "value": "Other ORP type"},
"invention": {"cobj":"0020", "value": "Other ORP type"},
"spin-off-company": {"cobj":"0020", "value": "Other ORP type"}
} }

View File

@ -1,17 +1,18 @@
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property>
<name>spark2GenNoDoiDatasetMaxExecutors</name>
<value>40</value>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
</property> </property>
<property> <property>
<name>sparkExecutorMemory</name> <name>spark2GenNoDoiDatasetExecutorMemory</name>
<value>2G</value>
<description>memory for individual executor</description> <description>memory for individual executor</description>
</property> </property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property> <property>
<name>oozieActionShareLibForSpark2</name> <name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description> <description>oozie action sharelib for spark 2.*</description>
@ -73,8 +74,9 @@
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class> <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar> <jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --conf spark.dynamicAllocation.enabled=true
--executor-cores=${sparkExecutorCores} --conf spark.dynamicAllocation.maxExecutors=${spark2GenNoDoiDatasetMaxExecutors}
--executor-memory=${spark2GenNoDoiDatasetExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -83,8 +85,7 @@
</spark-opts> </spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg> <arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg> <arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg> <arg>-i</arg><arg>last_orcid_dataset</arg>
<arg>-ow</arg><arg>no_doi_works/</arg>
<arg>-oew</arg><arg>no_doi_dataset</arg> <arg>-oew</arg><arg>no_doi_dataset</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>

View File

@ -10,30 +10,28 @@ import java.nio.file.Paths;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.time.Duration; import java.util.*;
import java.time.LocalDateTime;
import java.time.temporal.TemporalUnit;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.utils.Lists;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParserTest;
import jdk.nashorn.internal.ir.annotations.Ignore; import jdk.nashorn.internal.ir.annotations.Ignore;
public class OrcidClientTest { public class OrcidClientTest {
final String orcidId = "0000-0001-7291-3210";
final int REQ_LIMIT = 24; final int REQ_LIMIT = 24;
final int REQ_MAX_TEST = 100; final int REQ_MAX_TEST = 100;
final int RECORD_DOWNLOADED_COUNTER_LOG_INTERVAL = 10; final int RECORD_DOWNLOADED_COUNTER_LOG_INTERVAL = 10;
@ -42,69 +40,45 @@ public class OrcidClientTest {
String toNotRetrieveDate = "2019-09-29 23:59:59.000000"; String toNotRetrieveDate = "2019-09-29 23:59:59.000000";
String lastUpdate = "2019-09-30 00:00:00"; String lastUpdate = "2019-09-30 00:00:00";
String shortDate = "2020-05-06 16:06:11"; String shortDate = "2020-05-06 16:06:11";
final String REQUEST_TYPE_RECORD = "record";
final String REQUEST_TYPE_WORK = "work/47652866";
final String REQUEST_TYPE_WORKS = "works";
private static Path testPath;
@BeforeAll
private static void setUp() throws IOException {
testPath = Files.createTempDirectory(XMLRecordParserTest.class.getName());
System.out.println("using test path: " + testPath);
}
// curl -i -H "Accept: application/vnd.orcid+xml" // curl -i -H "Accept: application/vnd.orcid+xml"
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d' // -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record' // 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
@Test @Test
private void multipleDownloadTest() throws Exception { public void downloadTest() throws Exception {
int toDownload = 10; final String orcid = "0000-0001-7291-3210";
long start = System.currentTimeMillis(); String record = testDownloadRecord(orcid, REQUEST_TYPE_RECORD);
OrcidDownloader downloader = new OrcidDownloader(); String filename = testPath + "/downloaded_record_".concat(orcid).concat(".xml");
TarArchiveInputStream input = new TarArchiveInputStream(
new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
TarArchiveEntry entry = input.getNextTarEntry();
BufferedReader br = null;
StringBuilder sb = new StringBuilder();
int rowNum = 0;
int entryNum = 0;
int modified = 0;
while (entry != null) {
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
String line;
while ((line = br.readLine()) != null) {
String[] values = line.toString().split(",");
List<String> recordInfo = Arrays.asList(values);
String orcidId = recordInfo.get(0);
if (downloader.isModified(orcidId, recordInfo.get(3))) {
slowedDownDownload(orcidId);
modified++;
}
rowNum++;
if (modified > toDownload) {
break;
}
}
entryNum++;
entry = input.getNextTarEntry();
}
long end = System.currentTimeMillis();
logToFile("start test: " + new Date(start).toString());
logToFile("end test: " + new Date(end).toString());
}
@Test
private void downloadTest(String orcid) throws Exception {
String record = testDownloadRecord(orcid);
String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
File f = new File(filename); File f = new File(filename);
OutputStream outStream = new FileOutputStream(f); OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream); IOUtils.write(record.getBytes(), outStream);
} }
private String testDownloadRecord(String orcidId) throws Exception { private String testDownloadRecord(String orcidId, String dataType) throws Exception {
try (CloseableHttpClient client = HttpClients.createDefault()) { try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/" + dataType);
httpGet.addHeader("Accept", "application/vnd.orcid+xml"); httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d"); httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
logToFile("start connection: " + new Date(System.currentTimeMillis()).toString()); long start = System.currentTimeMillis();
CloseableHttpResponse response = client.execute(httpGet); CloseableHttpResponse response = client.execute(httpGet);
logToFile("end connection: " + new Date(System.currentTimeMillis()).toString()); long end = System.currentTimeMillis();
if (response.getStatusLine().getStatusCode() != 200) { if (response.getStatusLine().getStatusCode() != 200) {
System.out logToFile(
.println("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode()); testPath, "Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
} }
logToFile(testPath, orcidId + " " + dataType + " " + (end - start) / 1000 + " seconds");
return IOUtils.toString(response.getEntity().getContent()); return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) { } catch (Throwable e) {
e.printStackTrace(); e.printStackTrace();
@ -129,7 +103,7 @@ public class OrcidClientTest {
} }
String[] values = line.split(","); String[] values = line.split(",");
List<String> recordInfo = Arrays.asList(values); List<String> recordInfo = Arrays.asList(values);
testDownloadRecord(recordInfo.get(0)); testDownloadRecord(recordInfo.get(0), REQUEST_TYPE_RECORD);
long endReq = System.currentTimeMillis(); long endReq = System.currentTimeMillis();
nReqTmp++; nReqTmp++;
if (nReqTmp == REQ_LIMIT) { if (nReqTmp == REQ_LIMIT) {
@ -189,20 +163,24 @@ public class OrcidClientTest {
final String base64CompressedRecord = IOUtils final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); .toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile("\n\ndownloaded \n\n" + recordFromSeqFile); logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile);
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161"); final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161", REQUEST_TYPE_RECORD);
assertTrue(recordFromSeqFile.equals(downloadedRecord)); assertTrue(recordFromSeqFile.equals(downloadedRecord));
} }
@Test @Test
private void lambdaFileReaderTest() throws Exception { @Disabled
public void lambdaFileReaderTest() throws Exception {
String last_update = "2021-01-12 00:00:06.685137";
TarArchiveInputStream input = new TarArchiveInputStream( TarArchiveInputStream input = new TarArchiveInputStream(
new GzipCompressorInputStream(new FileInputStream("/develop/last_modified.csv.tar"))); new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
TarArchiveEntry entry = input.getNextTarEntry(); TarArchiveEntry entry = input.getNextTarEntry();
BufferedReader br = null; BufferedReader br = null;
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
int rowNum = 0; int rowNum = 1;
int modifiedNum = 1;
int entryNum = 0; int entryNum = 0;
boolean firstNotModifiedFound = false;
while (entry != null) { while (entry != null) {
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
String line; String line;
@ -210,59 +188,44 @@ public class OrcidClientTest {
String[] values = line.toString().split(","); String[] values = line.toString().split(",");
List<String> recordInfo = Arrays.asList(values); List<String> recordInfo = Arrays.asList(values);
assertTrue(recordInfo.size() == 4); assertTrue(recordInfo.size() == 4);
String orcid = recordInfo.get(0);
String modifiedDate = recordInfo.get(3);
rowNum++; rowNum++;
if (rowNum == 1) { if (rowNum == 2) {
assertTrue(recordInfo.get(3).equals("last_modified")); assertTrue(recordInfo.get(3).equals("last_modified"));
} else if (rowNum == 2) { } else {
assertTrue(recordInfo.get(0).equals("0000-0002-0499-7333")); // SparkDownloadOrcidAuthors.lastUpdate = last_update;
// boolean isModified = SparkDownloadOrcidAuthors.isModified(orcid, modifiedDate);
// if (isModified) {
// modifiedNum++;
// } else {
// if (!firstNotModifiedFound) {
// firstNotModifiedFound = true;
// logToFile(orcid + " - " + modifiedDate + " > " + isModified);
// }
// }
} }
} }
entryNum++; entryNum++;
assertTrue(entryNum == 1); assertTrue(entryNum == 1);
entry = input.getNextTarEntry(); entry = input.getNextTarEntry();
} }
logToFile(testPath, "modifiedNum : " + modifiedNum + " / " + rowNum);
} }
@Test public static void logToFile(Path basePath, String log) throws IOException {
/**
 * Scans every entry of the gzip-compressed tar archive at /tmp/last_modified.csv.tar, splits each line on
 * commas and counts how many rows {@code OrcidDownloader.isModified} flags as modified (first column is
 * passed as the ORCID id, fourth column as the modification date). Both counters are written through
 * {@code logToFile}.
 *
 * @throws Exception if the archive cannot be opened or read, or if logging fails
 */
private void lambdaFileCounterTest() throws Exception {
    OrcidDownloader downloader = new OrcidDownloader();
    int rowNum = 0;
    int modified = 0;
    // try-with-resources guarantees the archive stream chain is released even when a row fails to parse
    try (TarArchiveInputStream input = new TarArchiveInputStream(
        new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")))) {
        TarArchiveEntry entry = input.getNextTarEntry();
        while (entry != null) {
            // The reader wraps the shared tar stream and is deliberately NOT closed per entry:
            // closing it would close the archive before the next entry is read.
            BufferedReader br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
            String line;
            while ((line = br.readLine()) != null) {
                List<String> recordInfo = Arrays.asList(line.split(","));
                String orcidId = recordInfo.get(0);
                if (downloader.isModified(orcidId, recordInfo.get(3))) {
                    modified++;
                }
                rowNum++;
            }
            entry = input.getNextTarEntry();
        }
    }
    logToFile("rowNum: " + rowNum);
    logToFile("modified: " + modified);
}
private void logToFile(String log)
throws IOException {
log = log.concat("\n"); log = log.concat("\n");
Path path = Paths.get("/tmp/orcid_log.txt"); Path path = basePath.resolve("orcid_log.txt");
if (!Files.exists(path)) {
Files.createFile(path);
}
Files.write(path, log.getBytes(), StandardOpenOption.APPEND); Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
} }
@Test @Test
@Disabled
private void slowedDownDownloadTest() throws Exception { private void slowedDownDownloadTest() throws Exception {
String orcid = "0000-0001-5496-1243"; String orcid = "0000-0001-5496-1243";
String record = slowedDownDownload(orcid); String record = slowedDownDownload(orcid);
@ -281,16 +244,17 @@ public class OrcidClientTest {
CloseableHttpResponse response = client.execute(httpGet); CloseableHttpResponse response = client.execute(httpGet);
long endReq = System.currentTimeMillis(); long endReq = System.currentTimeMillis();
long reqSessionDuration = endReq - start; long reqSessionDuration = endReq - start;
logToFile("req time (millisec): " + reqSessionDuration); logToFile(testPath, "req time (millisec): " + reqSessionDuration);
if (reqSessionDuration < 1000) { if (reqSessionDuration < 1000) {
logToFile("wait ...."); logToFile(testPath, "wait ....");
Thread.sleep(1000 - reqSessionDuration); Thread.sleep(1000 - reqSessionDuration);
} }
long end = System.currentTimeMillis(); long end = System.currentTimeMillis();
long total = end - start; long total = end - start;
logToFile("total time (millisec): " + total); logToFile(testPath, "total time (millisec): " + total);
if (response.getStatusLine().getStatusCode() != 200) { if (response.getStatusLine().getStatusCode() != 200) {
logToFile("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode()); logToFile(
testPath, "Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
} }
return IOUtils.toString(response.getEntity().getContent()); return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) { } catch (Throwable e) {
@ -298,4 +262,89 @@ public class OrcidClientTest {
} }
return new String(""); return new String("");
} }
/**
 * Requests the payload for ORCID id 0000-0003-0015-1952 through {@code testDownloadRecord} with
 * {@code REQUEST_TYPE_WORK} and stores it in /tmp/downloaded_work_&lt;orcid&gt;.xml.
 *
 * @throws Exception if the download or the file write fails
 */
@Test
public void downloadWorkTest() throws Exception {
    String orcid = "0000-0003-0015-1952";
    String record = testDownloadRecord(orcid, REQUEST_TYPE_WORK);
    String filename = "/tmp/downloaded_work_".concat(orcid).concat(".xml");
    File f = new File(filename);
    // try-with-resources: the stream is flushed and closed even if the write throws
    try (OutputStream outStream = new FileOutputStream(f)) {
        IOUtils.write(record.getBytes(), outStream);
    }
}
/**
 * Requests the payload for ORCID id 0000-0001-5004-5918 through {@code testDownloadRecord} with
 * {@code REQUEST_TYPE_RECORD} and stores it in /tmp/downloaded_record_&lt;orcid&gt;.xml.
 *
 * @throws Exception if the download or the file write fails
 */
@Test
public void downloadRecordTest() throws Exception {
    String orcid = "0000-0001-5004-5918";
    String record = testDownloadRecord(orcid, REQUEST_TYPE_RECORD);
    String filename = "/tmp/downloaded_record_".concat(orcid).concat(".xml");
    File f = new File(filename);
    // try-with-resources: the stream is flushed and closed even if the write throws
    try (OutputStream outStream = new FileOutputStream(f)) {
        IOUtils.write(record.getBytes(), outStream);
    }
}
/**
 * Requests the payload for ORCID id 0000-0001-5004-5918 through {@code testDownloadRecord} with
 * {@code REQUEST_TYPE_WORKS} and stores it in /tmp/downloaded_works_&lt;orcid&gt;.xml.
 *
 * @throws Exception if the download or the file write fails
 */
@Test
public void downloadWorksTest() throws Exception {
    String orcid = "0000-0001-5004-5918";
    String record = testDownloadRecord(orcid, REQUEST_TYPE_WORKS);
    String filename = "/tmp/downloaded_works_".concat(orcid).concat(".xml");
    File f = new File(filename);
    // try-with-resources: the stream is flushed and closed even if the write throws
    try (OutputStream outStream = new FileOutputStream(f)) {
        IOUtils.write(record.getBytes(), outStream);
    }
}
/**
 * Requests the payload for ORCID id 0000-0001-5004-5918 through {@code testDownloadRecord} with
 * {@code REQUEST_TYPE_WORK} and stores it in /tmp/downloaded_work_47652866_&lt;orcid&gt;.xml.
 *
 * NOTE(review): the output file name embeds 47652866, but that value is never passed to
 * testDownloadRecord — confirm whether a specific work id was meant to be requested.
 *
 * @throws Exception if the download or the file write fails
 */
@Test
public void downloadSingleWorkTest() throws Exception {
    String orcid = "0000-0001-5004-5918";
    String record = testDownloadRecord(orcid, REQUEST_TYPE_WORK);
    String filename = "/tmp/downloaded_work_47652866_".concat(orcid).concat(".xml");
    File f = new File(filename);
    // try-with-resources: the stream is flushed and closed even if the write throws
    try (OutputStream outStream = new FileOutputStream(f)) {
        IOUtils.write(record.getBytes(), outStream);
    }
}
/**
 * Checks the oid-based de-duplication idiom: of three authors, two sharing oid "1", only the first
 * occurrence of each oid survives {@code removeIf}, leaving two entries.
 */
@Test
public void cleanAuthorListTest() throws Exception {
    List<AuthorData> authors = Lists.newArrayList();

    AuthorData original = new AuthorData();
    original.setOid("1");
    original.setName("n1");
    original.setSurname("s1");
    original.setCreditName("c1");
    authors.add(original);

    // same oid and names as the first entry: this is the one expected to be dropped
    AuthorData duplicate = new AuthorData();
    duplicate.setOid("1");
    duplicate.setName("n1");
    duplicate.setSurname("s1");
    duplicate.setCreditName("c1");
    authors.add(duplicate);

    AuthorData distinct = new AuthorData();
    distinct.setOid("3");
    distinct.setName("n3");
    distinct.setSurname("s3");
    distinct.setCreditName("c3");
    authors.add(distinct);

    assertTrue(authors.size() == 3);

    Set<String> seenOids = new HashSet<>();
    authors.removeIf(author -> {
        // Set.add returns false when the oid was already recorded
        boolean firstOccurrence = seenOids.add(author.getOid());
        return !firstOccurrence;
    });

    assertTrue(authors.size() == 2);
}
/**
 * Reads the classpath resource 0000-0003-3028-6161.compressed.base64, decompresses it with
 * {@code ArgumentApplicationParser.decompressValue} and writes the result to the test log.
 *
 * Kept disabled. The JUnit 4 {@code @Ignore} annotation is not honoured by JUnit Jupiter unless the
 * migration-support extension is enabled, so the Jupiter {@code @Disabled} annotation is used instead
 * (fully qualified so that no additional import is required).
 *
 * @throws Exception if the resource cannot be read or decompressed, or if logging fails
 */
@Test
@org.junit.jupiter.api.Disabled
public void testUpdatedRecord() throws Exception {
    final String base64CompressedRecord = IOUtils
        .toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
    final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
    logToFile(testPath, "\n\nrecord updated \n\n" + record);
}
/**
 * Decompresses a hard-coded base64 payload with {@code ArgumentApplicationParser.decompressValue}
 * and passes the resulting text to {@code logToFile}.
 *
 * NOTE(review): the method is private and carries {@code @Ignore}; presumably it is meant to stay
 * disabled — confirm which JUnit version is expected to honour these annotations.
 */
@Test
@Ignore
private void testUpdatedWork() throws Exception {
// compressed work payload embedded inline instead of being loaded from a resource file
final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA==";
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
logToFile(testPath, "\n\nwork updated \n\n" + work);
}
} }

View File

@ -1,20 +1,44 @@
package eu.dnetlib.doiboost.orcid.xml; package eu.dnetlib.doiboost.orcid.xml;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.OrcidClientTest;
import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
public class XMLRecordParserTest { public class XMLRecordParserTest {
private static final String NS_WORK = "work";
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
private static final String NS_COMMON = "common";
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static Path testPath;
/**
 * Creates the temporary directory, named after this test class, that the tests use as their log location,
 * and stores it in {@code testPath}.
 *
 * Declared non-private because JUnit Jupiter requires {@code @BeforeAll} lifecycle methods to be
 * non-private; newer Jupiter releases reject private ones.
 *
 * @throws IOException if the temporary directory cannot be created
 */
@BeforeAll
public static void setUp() throws IOException {
    testPath = Files.createTempDirectory(XMLRecordParserTest.class.getName());
}
@Test @Test
private void testOrcidAuthorDataXMLParser() throws Exception { public void testOrcidAuthorDataXMLParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml")); String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
@ -26,10 +50,11 @@ public class XMLRecordParserTest {
System.out.println("name: " + authorData.getName()); System.out.println("name: " + authorData.getName());
assertNotNull(authorData.getSurname()); assertNotNull(authorData.getSurname());
System.out.println("surname: " + authorData.getSurname()); System.out.println("surname: " + authorData.getSurname());
OrcidClientTest.logToFile(testPath, OBJECT_MAPPER.writeValueAsString(authorData));
} }
@Test @Test
private void testOrcidXMLErrorRecordParser() throws Exception { public void testOrcidXMLErrorRecordParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml")); String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
@ -42,7 +67,7 @@ public class XMLRecordParserTest {
} }
@Test @Test
private void testOrcidWorkDataXMLParser() throws Exception { public void testOrcidWorkDataXMLParser() throws Exception {
String xml = IOUtils String xml = IOUtils
.toString( .toString(
@ -54,8 +79,7 @@ public class XMLRecordParserTest {
assertNotNull(workData); assertNotNull(workData);
assertNotNull(workData.getOid()); assertNotNull(workData.getOid());
System.out.println("oid: " + workData.getOid()); System.out.println("oid: " + workData.getOid());
assertNotNull(workData.getDoi()); assertNull(workData.getDoi());
System.out.println("doi: " + workData.getDoi());
} }
@Test @Test
@ -64,9 +88,6 @@ public class XMLRecordParserTest {
String xml = IOUtils String xml = IOUtils
.toString( .toString(
this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml")); this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml"));
XMLRecordParser p = new XMLRecordParser();
AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes()); AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes());
assertNotNull(authorData); assertNotNull(authorData);
assertNotNull(authorData.getOtherNames()); assertNotNull(authorData.getOtherNames());
@ -74,4 +95,43 @@ public class XMLRecordParserTest {
String jsonData = JsonWriter.create(authorData); String jsonData = JsonWriter.create(authorData);
assertNotNull(jsonData); assertNotNull(jsonData);
} }
// @Test
// private void testWorkIdLastModifiedDateXMLParser() throws Exception {
// String xml = IOUtils
// .toString(
// this.getClass().getResourceAsStream("record_0000-0001-5004-5918.xml"));
// Map<String, String> workIdLastModifiedDate = XMLRecordParser.retrieveWorkIdLastModifiedDate(xml.getBytes());
// workIdLastModifiedDate.forEach((k, v) -> {
// try {
// OrcidClientTest
// .logToFile(
// k + " " + v + " isModified after " + SparkDownloadOrcidWorks.lastUpdateValue + ": "
// + SparkDownloadOrcidWorks.isModified("0000-0001-5004-5918", v));
// } catch (IOException e) {
// }
// });
// }
/**
 * Parses the record_0000-0001-5004-5918.xml resource into an {@code AuthorSummary}, attaches the
 * compressed form of the source XML to it and writes its JSON rendering to the test log.
 */
@Test
public void testAuthorSummaryXMLParser() throws Exception {
    final String recordXml = IOUtils
        .toString(getClass().getResourceAsStream("record_0000-0001-5004-5918.xml"));

    final AuthorSummary summary = XMLRecordParser.VTDParseAuthorSummary(recordXml.getBytes());
    summary.setBase64CompressData(ArgumentApplicationParser.compressArgument(recordXml));

    final String json = JsonWriter.create(summary);
    OrcidClientTest.logToFile(testPath, json);
}
/**
 * Parses the activity_work_0000-0003-2760-1191.xml resource into a {@code WorkDetail}, wraps it in a
 * {@code Work} together with the compressed form of the source XML and writes the JSON rendering to
 * the test log.
 */
@Test
public void testWorkDataXMLParser() throws Exception {
    final String activityXml = IOUtils
        .toString(getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));

    final WorkDetail detail = XMLRecordParserNoDoi.VTDParseWorkData(activityXml.getBytes());

    final Work wrapper = new Work();
    wrapper.setWorkDetail(detail);
    wrapper.setBase64CompressData(ArgumentApplicationParser.compressArgument(activityXml));

    final String json = JsonWriter.create(wrapper);
    OrcidClientTest.logToFile(testPath, json);
}
} }

View File

@ -21,8 +21,8 @@ import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; import eu.dnetlib.dhp.schema.orcid.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
public class OrcidNoDoiTest { public class OrcidNoDoiTest {
@ -48,7 +48,7 @@ public class OrcidNoDoiTest {
if (p == null) { if (p == null) {
logger.info("XMLRecordParserNoDoi null"); logger.info("XMLRecordParserNoDoi null");
} }
WorkDataNoDoi workData = null; WorkDetail workData = null;
try { try {
workData = p.VTDParseWorkData(xml.getBytes()); workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) { } catch (Exception e) {
@ -105,7 +105,7 @@ public class OrcidNoDoiTest {
if (p == null) { if (p == null) {
logger.info("XMLRecordParserNoDoi null"); logger.info("XMLRecordParserNoDoi null");
} }
WorkDataNoDoi workData = null; WorkDetail workData = null;
try { try {
workData = p.VTDParseWorkData(xml.getBytes()); workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) { } catch (Exception e) {
@ -136,7 +136,7 @@ public class OrcidNoDoiTest {
if (p == null) { if (p == null) {
logger.info("XMLRecordParserNoDoi null"); logger.info("XMLRecordParserNoDoi null");
} }
WorkDataNoDoi workData = null; WorkDetail workData = null;
try { try {
workData = p.VTDParseWorkData(xml.getBytes()); workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) { } catch (Exception e) {
@ -179,7 +179,7 @@ public class OrcidNoDoiTest {
if (p == null) { if (p == null) {
logger.info("XMLRecordParserNoDoi null"); logger.info("XMLRecordParserNoDoi null");
} }
WorkDataNoDoi workData = null; WorkDetail workData = null;
try { try {
workData = p.VTDParseWorkData(xml.getBytes()); workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) { } catch (Exception e) {
@ -308,7 +308,7 @@ public class OrcidNoDoiTest {
if (p == null) { if (p == null) {
logger.info("XMLRecordParserNoDoi null"); logger.info("XMLRecordParserNoDoi null");
} }
WorkDataNoDoi workData = null; WorkDetail workData = null;
try { try {
workData = p.VTDParseWorkData(xml.getBytes()); workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) { } catch (Exception e) {

View File

@ -0,0 +1 @@
H4sIAAAAAAAAAO1c63LbNhb+n6fA6EebTE2JulpyYnXVpE2a1Jus7V5mO/0BkZCImCJVgLSidjqzf/cJ9oH2TfZJ9jsASVESLWdsddNulJlcDJxzcO4XEJMnn7+bhexaKC3j6LTWrLs1JiIv9mU0Pa19e/mV068xnfDI52EcidPaUuja58MHTxaxujqhP9g8TRwgYK/Xb/Z7TbdZY3OeBKe1hotfDn63nF6v13GOO91mg3AaK8hrqeVYhjJZntbm6TiUXo2BpUifyCgRKuLhaS1IkvlJo7FYLOqx8qSPP6eNSDdyiBxD+KnHEyPITSgFSI7jS53IyNuNVQIq8MRcCZAS/g60AibHipNAKCfiM3Ez1gomx5qJ2RgWCuT8ZqwVTKENpWK1QxO0ncN68Wy2SwF2P4eGULHaIbfdz6HnYCuGlRxfJFyG+ma8TcicwpVYLnYemAEUks+AvUNy2i5g31kfcqQvokROpNils23gnM4kjWzM3ISbARRaUWIiFEJN7FLICijH476vhN6BkwGsouhawgGdeazlbiffhMwpUMDejEW7OWSAMInV8mbgDGBlp3kYL2dQ5S5j5TA51s8pD6H62yJ9DSzH1UJdS29H8GUA6757m8cWtkGGgA7lLpOuYFbRpAVXHgV9qna47TrcikP8rMS1FItdbBZAOd44DXdYlXY3+QMBHadql/a2QGvDBwy/ntj8ceIpQdnQ8fHnsOW2UByaTtu9bLVOOv2TJqpPx/37k0YV9BqdkOvEmaFIIQLL1Jqu02pdus0T1z1xe/VOu7+iVoGzRtMybNe21x0vlPBBBP4KogyVKjkkrWioZaUSi9QYvXnjdH948bfLL1vtN98evx5dXA4KvgizkiTV0OFOVANRiRvEOhkWfBQIZnklYeNWETeUQEVp+ApZ7FPNnsZhKKaCfRNHfhxt0jKQDypOyRZN+5DIJKzQuF2+iD3JQ/aF4jJiX6W2+mLhjCepMkHNsPFXsRjHKmJfRxMeJZp9L5OAoVsx/4jThHH2FZ/JcMle2NzD4gkbpYnUM3YxF16i0hl7JjWqh1AFqyXGnjQ2WbW8v4U0VAnsxsvR2Qi8JKYhiuciytDWoUroOohVgjqnPSXnJMzwkzB5PP9kmjz+ejbHHkfSP2HfBzxhUkNShD1lZxYrxr2fU6nwb8gfiVSh97oWYTynJAkFeTCISeCa6dSDNjTjVmCdC+xnArOHo4tnj+iAKCZVTeQ7OiJNoAdxxMbQn4x0IrhPMJxdp2EkFLf9GktiLBU0odcEtkr0ERO0CONB69paEVGHVJyGlPfq7GtbPZdwJIZmh41lHMZTpOqQzYQX8AjM4jhtkEnoBVl1/XAljBI0C+P4ighBTOQeHAmtIPELWkApQ3cZkihiEithTzMeBXl0wOcgPl4SXBLxZOP8yEcoGxTxDolemjpMcobI4DjRcIVtLTLJ62wUyRmo6CT1ISn0P50KnQAIZtSp9gRsvdJehfFyy+B4JTVILAIRsamIRCK9nCWBSq3iKEMB3JVmE8sqeCnZn4foV6gZp7bFsK6XkRcAN051poisIBm9kawkqdUF/Sv2rRskKN0sgEojsKugTnAl3iGyIuuHQTrj5I0I0QQmJmduGG8u3Pr1+K2go+DVlzEZF00KSUfdrmU0slENLiercJ+twp3Yt+5kOfek8lKo3fjmhrPAl23YB6Wwv3hmQ8akjEomnwktp9ERuxAJGv7pkUklb7iC8uWcEswJMo1VhhdTCBtTG+rtXiF+xkJkebFZqJKdoxUKukOhFrAoJJ5aa1MRjSgPMDjV1Ph4wi4SdhnEM1jiRaznkuwEmWwSPmJfRtMQ5x6xVBt45gtfmgkkO6lQXk5SLxHfMxg0WZBNX6aRYK32EWu5za4Vf5ROU/hw06z160hza1IiaShNqWyqhADPIScj203S+MPzzx4ZOmRoG4V5JIfC5BBKTiSvDSIDu6bJSgU+PHcesQUo4khPpSY3ZjFgbVJnFyVf
p1CD7GVnt3pQYmpCJZTRFUiAn8zHch9kC07Gns05Um6Vz5wRmdc2Z1ruzwTXKax3ws4z6vhhjr8pFxkut84gQbQIESG5Bxetv82zZjbWAXZnGI4cjthYaqlzzbKQ0shmhBfiEkVwKbgXZBIbsVINelQfQNSwbLJb7JVYswUlEiXF8YwEtuCJMSUn2slZqrPnKk7nJudnw8sR0UgUOgZyOaMA8Q7ehfYBLj2WKgmKn7THI+t4U0Pm3/8yO2bW54YlkDP6yvNPlVHOhUa1gQUuoZuJJF7R8qFciYR4AZummE5Ys8/OPwN12z48bLYRf6F4DIX4EhntR8WjqfjJVAjkW41SR25UZrXTqg/a7MeOW3ddp9Op93s/gT9xpa3b0wHOfQ/ouuzH9qDeGtAB3X5+QDkYg9hqBdIEqNeUx8z4EyUmaqaUZo2TbNWBzQqgAJwYhqgAKLiClrDZjD1M/vOPf57id6ve6T9mb7Kf0LVbUUMxAR4Kl7B9CKVNsFagteuD3jpandIpJlZTr45sijCeycsC3OgJuV8T1zzK2NViSpXRNCQmMCami0lDXubEbVcI4ME9AZeIEvNWGzn1E1Yi4ZZJgJ45ahuyVe83NyA3VFyGPT6uoloJ2u2ugVptrrz56DZ7+4JGLMoBMRX19oBSTadrnevTbZc8onpNGNXkstNklFOFZUqub84w6RmzQdZcVIXu0zjywlTbBgZGOUdavLbt8EWl1+q8GfSZj2kKGWa9aVilMkRClsxMQTTtOvLVJdVzW8gncWoSKrXdRatguxvoM+DXtqzeUvOMB290JFshuDvPkuT+Uq9LYlx/JYG6obrMVQzXNR2APdWx3X5WdWAQRLMhWtJ/NrFsDyalqcVDv7Fa2153kuVcDMdynIh3Gb31rZvwrnmYiuFfTKMVil87/nG33ez1B72+3/EHYtxqdwb+2D9u9pu+N3aPQMeMVIbWKat9gGGxRkzwMaIDnmiYOAxuh8Htzz64/fGmtMNIdhjJdo5kh/nrQ89fh2HrMGwdhq0//rB1mKz+h5OVnQ9S1EqVDSkv0Vsm7KnkSqF6c8PIS8ooaFzZ60/PoGgvQCuccJC2BuIhYhIjx0wie19blGd8gj6XfUGdQyjM0jeph940Zk8NN7HzHHnOt1ujCBxES/ZGIcLMypczMPwiBffWCy4SIaOFQGf168sYrERYfxXyVP+WcUhrnL1C6uQ6o0Bl/41QympztBRoydlLfk3lDAvfhdwHz4qDeIwKFIiM93MevYUORldxKK64sudTqQ7Yd9JLYpUdqcU8YC/4WzKekVl4aKLYWarmwTLTwrUEJ/6CK99ydYlaeCXZCIIG0qw8p3YCzdOZNwqpbTMmWULDLJ8b0T4NzOoM9THIVvlc0ZIfS1YANt1603Wbjbcc/mrdmz7z1YlAvdnv9Q0V8DhNKW0SCjV+6BjMxnUcpjORH2qWsk+DmWtsfj80IFLraMVq97jjtPtu12zl7YiirREsSrkbjY9vhrFRFiH08oGgo5QeB2WEOlj6bXM6twN4+Yvn+qyffbClGT7/ppkN6/kH0mK8L75fm9dclvzqc3sZgkwxJA0WH17NyhacMc7Q7RRgdmELzufLodstoOjH9U/Q1Szl6KXXPXqbeGm3+pt7CcBedmSfwkk9WCuY2IK7lZo1Tn4p4tCtiEPXIg7dizjli5HKQ0q23XVKRKkrlL9Qy438oaV5l4N6JGp3P3tF9HYGbLZHug3kfIhmfFJJcQ1q+y1DpZnubsP5bA+Wa7uDbrPZ6/xe1tlJ/89uAbEHA7Qc3aq7Tr/r9jrtVrvd7f5epnjPk/7sRkFtvLdRbi2pv5eN7nbwhzdZ1Y5eL2GpCotnaFdeOEdrVcffde7V06uGuZ4OGyJqlAqhbtjm1TGXL86qa3ZWHbKDjaxjd7IJw6HW20GX5WT3QQ537H2Qk90HOfHEsffXTn7X7OS3pA/fp6A8qgfJLCw9lAvXvkXQjYYcpziqXK0396qN
VQJwzDO5dbB1ldqXfWsP+/KH7U3neNBpOt1W2y3xKW+mZp7s7cKueNPXeD+mM9ExrMnEvr/bHDjO4uiXOH+aVgasolM6jCf2n0JXCLYFrdDbD+3gkx+1ubsh33sduA32wazecvpuu+30Bt0dzzhvtHoV9l6tftNIeTD8/Q3fG7htRO3gLuFehb1Pw2/eFhzsfV97t52WOzh2BseDH+5g7yrsfdp7/SLoI7T2lsDV92AHzYjh2jXgQSFiWLoF/QjVsfe62G73eo47aLfuVBe3sffaELFXxSX3R2jrigaxfKN/0Aglg+KDxkeojr3PxL1O59jptbp3aZqqsPeZDMrfqj5CW28JXPWp7qAXGqbWvlR+hCrZe4/QbTc7znGv1btTj7CNvc+0sPYR+mDs+xu71Ru4Trcz6N7J2NvY+70hK70vOBh7D7di+f/ucrdbsS3svd6S2Kcjz7PHIwdz3/9SrNOnTxdu7y6JvAp7r/1ddtGx9j7oYPQ9TPjdrus00ZzfbcLfwt6n0deefh2MfX9jdzq9ntNqd9p3MvY29j6Nvfmq7//M3tvrG9/480eG5j9dG4rVf72yvvEgI0R/DB/8F4+Tql7oTQAA

View File

@ -732,7 +732,7 @@
<common:external-id-relationship>part-of</common:external-id-relationship> <common:external-id-relationship>part-of</common:external-id-relationship>
</common:external-id> </common:external-id>
</common:external-ids> </common:external-ids>
<work:work-summary put-code="0" visibility="private"> <work:work-summary put-code="123456" visibility="private">
<common:created-date>2001-12-31T12:00:00</common:created-date> <common:created-date>2001-12-31T12:00:00</common:created-date>
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date> <common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
<common:source> <common:source>