forked from D-Net/dnet-hadoop
[ORCID-no-doi] integrating PR#98 D-Net/dnet-hadoop#98
This commit is contained in:
parent
70e49ed53c
commit
ee34cc51c3
|
@ -0,0 +1,31 @@
|
||||||
|
diff a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java (rejected hunks)
|
||||||
|
@@ -1,8 +1,6 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
|
-import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
-
|
||||||
|
import static com.google.common.base.Preconditions.checkArgument;
|
||||||
|
|
||||||
|
import java.text.ParseException;
|
||||||
|
@@ -10,6 +8,8 @@ import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
+import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
+
|
||||||
|
/**
|
||||||
|
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
|
||||||
|
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
|
||||||
|
@@ -137,7 +137,10 @@ public class Relation extends Oaf {
|
||||||
|
try {
|
||||||
|
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
|
||||||
|
} catch (ParseException e) {
|
||||||
|
- throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
|
||||||
|
+ throw new IllegalArgumentException(String
|
||||||
|
+ .format(
|
||||||
|
+ "invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
|
||||||
|
+ getValidationDate()));
|
||||||
|
}
|
||||||
|
|
||||||
|
super.mergeFrom(r);
|
|
@ -0,0 +1,79 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * Serializable bean carrying the history-related properties of an ORCID author: how and when the
 * record was created, completed, submitted, last modified and deactivated, plus three boolean flags
 * (claimed, verified email, verified primary email).
 *
 * <p>All dates are kept as plain strings; this class performs no parsing or validation, so the
 * expected date format is whatever the producer writes. NOTE(review): presumably mirrors the
 * "history" section of an ORCID record - confirm against the code that populates it.
 *
 * <p>Unset string properties are {@code null}; unset flags are {@code false}.
 */
public class AuthorHistory implements Serializable {

    private String creationMethod;
    private String completionDate;
    private String submissionDate;
    private String lastModifiedDate;
    private boolean claimed;
    private String deactivationDate;
    private boolean verifiedEmail;
    private boolean verifiedPrimaryEmail;

    /** @return the creation method, or {@code null} when not set */
    public String getCreationMethod() {
        return creationMethod;
    }

    /** @param creationMethod the creation method to store (stored as given, may be {@code null}) */
    public void setCreationMethod(String creationMethod) {
        this.creationMethod = creationMethod;
    }

    /** @return the completion date string, or {@code null} when not set */
    public String getCompletionDate() {
        return completionDate;
    }

    /** @param completionDate the completion date string to store (not validated) */
    public void setCompletionDate(String completionDate) {
        this.completionDate = completionDate;
    }

    /** @return the submission date string, or {@code null} when not set */
    public String getSubmissionDate() {
        return submissionDate;
    }

    /** @param submissionDate the submission date string to store (not validated) */
    public void setSubmissionDate(String submissionDate) {
        this.submissionDate = submissionDate;
    }

    /** @return the last-modified date string, or {@code null} when not set */
    public String getLastModifiedDate() {
        return lastModifiedDate;
    }

    /** @param lastModifiedDate the last-modified date string to store (not validated) */
    public void setLastModifiedDate(String lastModifiedDate) {
        this.lastModifiedDate = lastModifiedDate;
    }

    /** @return the claimed flag; {@code false} unless explicitly set */
    public boolean isClaimed() {
        return claimed;
    }

    /** @param claimed the claimed flag to store */
    public void setClaimed(boolean claimed) {
        this.claimed = claimed;
    }

    /** @return the deactivation date string, or {@code null} when not set */
    public String getDeactivationDate() {
        return deactivationDate;
    }

    /** @param deactivationDate the deactivation date string to store (not validated) */
    public void setDeactivationDate(String deactivationDate) {
        this.deactivationDate = deactivationDate;
    }

    /** @return the verified-email flag; {@code false} unless explicitly set */
    public boolean isVerifiedEmail() {
        return verifiedEmail;
    }

    /** @param verifiedEmail the verified-email flag to store */
    public void setVerifiedEmail(boolean verifiedEmail) {
        this.verifiedEmail = verifiedEmail;
    }

    /** @return the verified-primary-email flag; {@code false} unless explicitly set */
    public boolean isVerifiedPrimaryEmail() {
        return verifiedPrimaryEmail;
    }

    /** @param verifiedPrimaryEmail the verified-primary-email flag to store */
    public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) {
        this.verifiedPrimaryEmail = verifiedPrimaryEmail;
    }
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
public class AuthorSummary extends OrcidData implements Serializable {
|
||||||
|
private AuthorData authorData;
|
||||||
|
private AuthorHistory authorHistory;
|
||||||
|
|
||||||
|
public AuthorData getAuthorData() {
|
||||||
|
return authorData;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAuthorData(AuthorData authorData) {
|
||||||
|
this.authorData = authorData;
|
||||||
|
}
|
||||||
|
|
||||||
|
public AuthorHistory getAuthorHistory() {
|
||||||
|
return authorHistory;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAuthorHistory(AuthorHistory authorHistory) {
|
||||||
|
this.authorHistory = authorHistory;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
@ -12,9 +12,9 @@ import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
public class Contributor extends AuthorData implements Serializable {
|
public class Contributor extends AuthorData implements Serializable {
|
||||||
private String sequence;
|
private String sequence;
|
||||||
private String role;
|
private String role;
|
||||||
private transient boolean simpleMatch = false;
|
private transient boolean simpleMatch;
|
||||||
private transient Double score = 0.0;
|
private transient Double score;
|
||||||
private transient boolean bestMatch = false;
|
private transient boolean bestMatch;
|
||||||
|
|
||||||
public String getSequence() {
|
public String getSequence() {
|
||||||
return sequence;
|
return sequence;
|
|
@ -1,11 +1,13 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class models the data related to external id, that are retrieved from an orcid publication
|
* This class models the data related to external id, that are retrieved from an orcid publication
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class ExternalId {
|
public class ExternalId implements Serializable {
|
||||||
private String type;
|
private String type;
|
||||||
private String value;
|
private String value;
|
||||||
private String relationShip;
|
private String relationShip;
|
|
@ -0,0 +1,34 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * Serializable base bean exposing three string properties: {@code base64CompressData},
 * {@code statusCode} and {@code downloadDate}.
 *
 * <p>The fields are {@code protected} so subclasses can access them directly. Values are stored
 * exactly as given; nothing is decoded, parsed or validated here. NOTE(review): judging by the
 * names these presumably hold a Base64-encoded compressed payload together with the status and
 * date of the download that produced it - confirm against the code that fills them.
 */
public class OrcidData implements Serializable {

    protected String base64CompressData;
    protected String statusCode;
    protected String downloadDate;

    /** @return the stored {@code base64CompressData} value, or {@code null} when not set */
    public String getBase64CompressData() {
        return base64CompressData;
    }

    /** @param base64CompressData the value to store (kept as given) */
    public void setBase64CompressData(String base64CompressData) {
        this.base64CompressData = base64CompressData;
    }

    /** @return the stored status code string, or {@code null} when not set */
    public String getStatusCode() {
        return statusCode;
    }

    /** @param statusCode the status code string to store (kept as given) */
    public void setStatusCode(String statusCode) {
        this.statusCode = statusCode;
    }

    /** @return the stored download date string, or {@code null} when not set */
    public String getDownloadDate() {
        return downloadDate;
    }

    /** @param downloadDate the download date string to store (not validated) */
    public void setDownloadDate(String downloadDate) {
        this.downloadDate = downloadDate;
    }
}
|
|
@ -1,11 +1,13 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class models the data related to a publication date, that are retrieved from an orcid publication
|
* This class models the data related to a publication date, that are retrieved from an orcid publication
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class PublicationDate {
|
public class PublicationDate implements Serializable {
|
||||||
private String year;
|
private String year;
|
||||||
private String month;
|
private String month;
|
||||||
private String day;
|
private String day;
|
|
@ -0,0 +1,79 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * Serializable bean with five string properties (creation method and completion, submission,
 * last-modified and deactivation dates) and three boolean flags (claimed, verified email,
 * verified primary email).
 *
 * <p>It declares the same eight properties as {@code AuthorHistory} in this package. Dates are kept
 * as plain strings and are neither parsed nor validated here. Unset strings are {@code null};
 * unset flags are {@code false}.
 */
public class Summary implements Serializable {

    private String creationMethod;
    private String completionDate;
    private String submissionDate;
    private String lastModifiedDate;
    private boolean claimed;
    private String deactivationDate;
    private boolean verifiedEmail;
    private boolean verifiedPrimaryEmail;

    /** @return the creation method, or {@code null} when not set */
    public String getCreationMethod() {
        return creationMethod;
    }

    /** @param creationMethod the creation method to store (stored as given, may be {@code null}) */
    public void setCreationMethod(String creationMethod) {
        this.creationMethod = creationMethod;
    }

    /** @return the completion date string, or {@code null} when not set */
    public String getCompletionDate() {
        return completionDate;
    }

    /** @param completionDate the completion date string to store (not validated) */
    public void setCompletionDate(String completionDate) {
        this.completionDate = completionDate;
    }

    /** @return the submission date string, or {@code null} when not set */
    public String getSubmissionDate() {
        return submissionDate;
    }

    /** @param submissionDate the submission date string to store (not validated) */
    public void setSubmissionDate(String submissionDate) {
        this.submissionDate = submissionDate;
    }

    /** @return the last-modified date string, or {@code null} when not set */
    public String getLastModifiedDate() {
        return lastModifiedDate;
    }

    /** @param lastModifiedDate the last-modified date string to store (not validated) */
    public void setLastModifiedDate(String lastModifiedDate) {
        this.lastModifiedDate = lastModifiedDate;
    }

    /** @return the claimed flag; {@code false} unless explicitly set */
    public boolean isClaimed() {
        return claimed;
    }

    /** @param claimed the claimed flag to store */
    public void setClaimed(boolean claimed) {
        this.claimed = claimed;
    }

    /** @return the deactivation date string, or {@code null} when not set */
    public String getDeactivationDate() {
        return deactivationDate;
    }

    /** @param deactivationDate the deactivation date string to store (not validated) */
    public void setDeactivationDate(String deactivationDate) {
        this.deactivationDate = deactivationDate;
    }

    /** @return the verified-email flag; {@code false} unless explicitly set */
    public boolean isVerifiedEmail() {
        return verifiedEmail;
    }

    /** @param verifiedEmail the verified-email flag to store */
    public void setVerifiedEmail(boolean verifiedEmail) {
        this.verifiedEmail = verifiedEmail;
    }

    /** @return the verified-primary-email flag; {@code false} unless explicitly set */
    public boolean isVerifiedPrimaryEmail() {
        return verifiedPrimaryEmail;
    }

    /** @param verifiedPrimaryEmail the verified-primary-email flag to store */
    public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) {
        this.verifiedPrimaryEmail = verifiedPrimaryEmail;
    }
}
|
|
@ -0,0 +1,16 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
public class Work extends OrcidData implements Serializable {
|
||||||
|
WorkDetail workDetail;
|
||||||
|
|
||||||
|
public WorkDetail getWorkDetail() {
|
||||||
|
return workDetail;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWorkDetail(WorkDetail workDetail) {
|
||||||
|
this.workDetail = workDetail;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,14 +1,19 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.Contributor;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.ExternalId;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.OrcidData;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.PublicationDate;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class models the data that are retrieved from orcid publication
|
* This class models the data that are retrieved from orcid publication
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class WorkDataNoDoi implements Serializable {
|
public class WorkDetail implements Serializable {
|
||||||
|
|
||||||
private String oid;
|
private String oid;
|
||||||
private String id;
|
private String id;
|
|
@ -1,208 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcid;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
|
||||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
|
||||||
import org.apache.http.client.methods.HttpGet;
|
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
|
||||||
import org.apache.http.impl.client.HttpClients;
|
|
||||||
import org.mortbay.log.Log;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
|
|
||||||
public class OrcidDownloader extends OrcidDSManager {
|
|
||||||
|
|
||||||
static final int REQ_LIMIT = 24;
|
|
||||||
static final int REQ_MAX_TEST = -1;
|
|
||||||
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
|
|
||||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
|
||||||
static final String lastUpdate = "2020-09-29 00:00:00";
|
|
||||||
private String lambdaFileName;
|
|
||||||
private String outputPath;
|
|
||||||
private String token;
|
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException, Exception {
|
|
||||||
OrcidDownloader orcidDownloader = new OrcidDownloader();
|
|
||||||
orcidDownloader.loadArgs(args);
|
|
||||||
orcidDownloader.parseLambdaFile();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String downloadRecord(String orcidId) throws IOException {
|
|
||||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
|
||||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
|
||||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
|
||||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
|
||||||
CloseableHttpResponse response = client.execute(httpGet);
|
|
||||||
if (response.getStatusLine().getStatusCode() != 200) {
|
|
||||||
Log
|
|
||||||
.info(
|
|
||||||
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
|
||||||
return new String("");
|
|
||||||
}
|
|
||||||
// return IOUtils.toString(response.getEntity().getContent());
|
|
||||||
return xmlStreamToString(response.getEntity().getContent());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String xmlStreamToString(InputStream xmlStream) throws IOException {
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(xmlStream));
|
|
||||||
String line;
|
|
||||||
StringBuffer buffer = new StringBuffer();
|
|
||||||
while ((line = br.readLine()) != null) {
|
|
||||||
buffer.append(line);
|
|
||||||
}
|
|
||||||
return buffer.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void parseLambdaFile() throws Exception {
|
|
||||||
int parsedRecordsCounter = 0;
|
|
||||||
int downloadedRecordsCounter = 0;
|
|
||||||
int savedRecordsCounter = 0;
|
|
||||||
long startDownload = 0;
|
|
||||||
Configuration conf = initConfigurationObject();
|
|
||||||
FileSystem fs = initFileSystemObject(conf);
|
|
||||||
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
|
||||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
|
||||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
|
||||||
Path hdfsoutputPath = new Path(
|
|
||||||
hdfsServerUri
|
|
||||||
.concat(workingPath)
|
|
||||||
.concat(outputPath)
|
|
||||||
.concat("updated_xml_authors.seq"));
|
|
||||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
|
||||||
new GzipCompressorInputStream(lambdaFileStream))) {
|
|
||||||
TarArchiveEntry entry = null;
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
try (SequenceFile.Writer writer = SequenceFile
|
|
||||||
.createWriter(
|
|
||||||
conf,
|
|
||||||
SequenceFile.Writer.file(hdfsoutputPath),
|
|
||||||
SequenceFile.Writer.keyClass(Text.class),
|
|
||||||
SequenceFile.Writer.valueClass(Text.class),
|
|
||||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
|
|
||||||
startDownload = System.currentTimeMillis();
|
|
||||||
while ((entry = tais.getNextTarEntry()) != null) {
|
|
||||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
|
|
||||||
String line;
|
|
||||||
while ((line = br.readLine()) != null) {
|
|
||||||
String[] values = line.split(",");
|
|
||||||
List<String> recordInfo = Arrays.asList(values);
|
|
||||||
int nReqTmp = 0;
|
|
||||||
long startReqTmp = System.currentTimeMillis();
|
|
||||||
// skip headers line
|
|
||||||
if (parsedRecordsCounter == 0) {
|
|
||||||
parsedRecordsCounter++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
parsedRecordsCounter++;
|
|
||||||
String orcidId = recordInfo.get(0);
|
|
||||||
if (isModified(orcidId, recordInfo.get(3))) {
|
|
||||||
String record = downloadRecord(orcidId);
|
|
||||||
downloadedRecordsCounter++;
|
|
||||||
if (!record.isEmpty()) {
|
|
||||||
// String compressRecord = ArgumentApplicationParser.compressArgument(record);
|
|
||||||
final Text key = new Text(recordInfo.get(0));
|
|
||||||
final Text value = new Text(record);
|
|
||||||
writer.append(key, value);
|
|
||||||
savedRecordsCounter++;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
long endReq = System.currentTimeMillis();
|
|
||||||
nReqTmp++;
|
|
||||||
if (nReqTmp == REQ_LIMIT) {
|
|
||||||
long reqSessionDuration = endReq - startReqTmp;
|
|
||||||
if (reqSessionDuration <= 1000) {
|
|
||||||
Log
|
|
||||||
.info(
|
|
||||||
"\nreqSessionDuration: "
|
|
||||||
+ reqSessionDuration
|
|
||||||
+ " nReqTmp: "
|
|
||||||
+ nReqTmp
|
|
||||||
+ " wait ....");
|
|
||||||
Thread.sleep(1000 - reqSessionDuration);
|
|
||||||
} else {
|
|
||||||
nReqTmp = 0;
|
|
||||||
startReqTmp = System.currentTimeMillis();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
|
||||||
Log
|
|
||||||
.info(
|
|
||||||
"Current parsed: "
|
|
||||||
+ parsedRecordsCounter
|
|
||||||
+ " downloaded: "
|
|
||||||
+ downloadedRecordsCounter
|
|
||||||
+ " saved: "
|
|
||||||
+ savedRecordsCounter);
|
|
||||||
if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
long endDownload = System.currentTimeMillis();
|
|
||||||
long downloadTime = endDownload - startDownload;
|
|
||||||
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Log.info("Download started at: " + new Date(startDownload).toString());
|
|
||||||
Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
|
|
||||||
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
|
|
||||||
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
|
|
||||||
Log.info("Saved Records Counter: " + savedRecordsCounter);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void loadArgs(String[] args) throws IOException, Exception {
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
|
||||||
IOUtils
|
|
||||||
.toString(
|
|
||||||
OrcidDownloader.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
hdfsServerUri = parser.get("hdfsServerUri");
|
|
||||||
Log.info("HDFS URI: " + hdfsServerUri);
|
|
||||||
workingPath = parser.get("workingPath");
|
|
||||||
Log.info("Default Path: " + workingPath);
|
|
||||||
lambdaFileName = parser.get("lambdaFileName");
|
|
||||||
Log.info("Lambda File Name: " + lambdaFileName);
|
|
||||||
outputPath = parser.get("outputPath");
|
|
||||||
Log.info("Output Data: " + outputPath);
|
|
||||||
token = parser.get("token");
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isModified(String orcidId, String modifiedDate) {
|
|
||||||
Date modifiedDateDt = null;
|
|
||||||
Date lastUpdateDt = null;
|
|
||||||
try {
|
|
||||||
if (modifiedDate.length() != 19) {
|
|
||||||
modifiedDate = modifiedDate.substring(0, 19);
|
|
||||||
}
|
|
||||||
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
|
||||||
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
|
||||||
} catch (Exception e) {
|
|
||||||
Log.info("[" + orcidId + "] Parsing date: ", e.getMessage());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return modifiedDateDt.after(lastUpdateDt);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -8,6 +8,7 @@ import java.util.Date;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
@ -24,13 +25,13 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||||
|
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class SparkDownloadOrcidAuthors {
|
public class SparkDownloadOrcidAuthors {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
|
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
|
||||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||||
static final String lastUpdate = "2020-09-29 00:00:00";
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@ -53,18 +54,25 @@ public class SparkDownloadOrcidAuthors {
|
||||||
final String token = parser.get("token");
|
final String token = parser.get("token");
|
||||||
final String lambdaFileName = parser.get("lambdaFileName");
|
final String lambdaFileName = parser.get("lambdaFileName");
|
||||||
logger.info("lambdaFileName: {}", lambdaFileName);
|
logger.info("lambdaFileName: {}", lambdaFileName);
|
||||||
|
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||||
|
logger.info("lastUpdate: {}", lastUpdate);
|
||||||
|
if (StringUtils.isBlank(lastUpdate)) {
|
||||||
|
throw new RuntimeException("last update info not found");
|
||||||
|
}
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
|
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
|
||||||
LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
|
LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
|
||||||
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
||||||
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
||||||
|
LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
|
||||||
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
||||||
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
||||||
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
|
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
|
||||||
|
@ -73,13 +81,14 @@ public class SparkDownloadOrcidAuthors {
|
||||||
logger.info("Retrieving data from lamda sequence file");
|
logger.info("Retrieving data from lamda sequence file");
|
||||||
JavaPairRDD<Text, Text> lamdaFileRDD = sc
|
JavaPairRDD<Text, Text> lamdaFileRDD = sc
|
||||||
.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
|
.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
|
||||||
logger.info("Data retrieved: " + lamdaFileRDD.count());
|
final long lamdaFileRDDCount = lamdaFileRDD.count();
|
||||||
|
logger.info("Data retrieved: " + lamdaFileRDDCount);
|
||||||
|
|
||||||
Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
|
Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
|
||||||
String orcidId = data._1().toString();
|
String orcidId = data._1().toString();
|
||||||
String lastModifiedDate = data._2().toString();
|
String lastModifiedDate = data._2().toString();
|
||||||
parsedRecordsAcc.add(1);
|
parsedRecordsAcc.add(1);
|
||||||
if (isModified(orcidId, lastModifiedDate)) {
|
if (isModified(orcidId, lastModifiedDate, lastUpdate)) {
|
||||||
modifiedRecordsAcc.add(1);
|
modifiedRecordsAcc.add(1);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -92,7 +101,7 @@ public class SparkDownloadOrcidAuthors {
|
||||||
final DownloadedRecordData downloaded = new DownloadedRecordData();
|
final DownloadedRecordData downloaded = new DownloadedRecordData();
|
||||||
downloaded.setOrcidId(orcidId);
|
downloaded.setOrcidId(orcidId);
|
||||||
downloaded.setLastModifiedDate(lastModifiedDate);
|
downloaded.setLastModifiedDate(lastModifiedDate);
|
||||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
CloseableHttpClient client = HttpClients.createDefault();
|
||||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||||
|
@ -109,19 +118,16 @@ public class SparkDownloadOrcidAuthors {
|
||||||
switch (statusCode) {
|
switch (statusCode) {
|
||||||
case 403:
|
case 403:
|
||||||
errorHTTP403Acc.add(1);
|
errorHTTP403Acc.add(1);
|
||||||
|
case 404:
|
||||||
|
errorHTTP404Acc.add(1);
|
||||||
case 409:
|
case 409:
|
||||||
errorHTTP409Acc.add(1);
|
errorHTTP409Acc.add(1);
|
||||||
case 503:
|
case 503:
|
||||||
errorHTTP503Acc.add(1);
|
errorHTTP503Acc.add(1);
|
||||||
throw new RuntimeException("Orcid request rate limit reached (HTTP 503)");
|
|
||||||
case 525:
|
case 525:
|
||||||
errorHTTP525Acc.add(1);
|
errorHTTP525Acc.add(1);
|
||||||
default:
|
default:
|
||||||
errorHTTPGenericAcc.add(1);
|
errorHTTPGenericAcc.add(1);
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
"Downloading " + orcidId + " status code: "
|
|
||||||
+ response.getStatusLine().getStatusCode());
|
|
||||||
}
|
}
|
||||||
return downloaded.toTuple2();
|
return downloaded.toTuple2();
|
||||||
}
|
}
|
||||||
|
@ -130,11 +136,7 @@ public class SparkDownloadOrcidAuthors {
|
||||||
.setCompressedData(
|
.setCompressedData(
|
||||||
ArgumentApplicationParser
|
ArgumentApplicationParser
|
||||||
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
||||||
} catch (Throwable e) {
|
client.close();
|
||||||
logger.info("Downloading " + orcidId, e.getMessage());
|
|
||||||
downloaded.setErrorMessage(e.getMessage());
|
|
||||||
return downloaded.toTuple2();
|
|
||||||
}
|
|
||||||
return downloaded.toTuple2();
|
return downloaded.toTuple2();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -142,10 +144,12 @@ public class SparkDownloadOrcidAuthors {
|
||||||
|
|
||||||
logger.info("Start execution ...");
|
logger.info("Start execution ...");
|
||||||
JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
|
JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
|
||||||
logger.info("Authors modified count: " + authorsModifiedRDD.count());
|
long authorsModifiedCount = authorsModifiedRDD.count();
|
||||||
|
logger.info("Authors modified count: " + authorsModifiedCount);
|
||||||
|
|
||||||
logger.info("Start downloading ...");
|
logger.info("Start downloading ...");
|
||||||
authorsModifiedRDD
|
authorsModifiedRDD
|
||||||
.repartition(10)
|
.repartition(100)
|
||||||
.map(downloadRecordFunction)
|
.map(downloadRecordFunction)
|
||||||
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
|
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
|
||||||
.saveAsNewAPIHadoopFile(
|
.saveAsNewAPIHadoopFile(
|
||||||
|
@ -154,10 +158,12 @@ public class SparkDownloadOrcidAuthors {
|
||||||
Text.class,
|
Text.class,
|
||||||
SequenceFileOutputFormat.class,
|
SequenceFileOutputFormat.class,
|
||||||
sc.hadoopConfiguration());
|
sc.hadoopConfiguration());
|
||||||
|
|
||||||
logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
|
logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
|
||||||
logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
|
logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
|
||||||
logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
|
logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
|
||||||
logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
|
logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
|
||||||
|
logger.info("errorHTTP404Acc: " + errorHTTP404Acc.value().toString());
|
||||||
logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
|
logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
|
||||||
logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
|
logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
|
||||||
logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
|
logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
|
||||||
|
@ -166,18 +172,27 @@ public class SparkDownloadOrcidAuthors {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isModified(String orcidId, String modifiedDate) {
|
public static boolean isModified(String orcidId, String modifiedDate, String lastUpdate) {
|
||||||
Date modifiedDateDt;
|
Date modifiedDateDt;
|
||||||
Date lastUpdateDt;
|
Date lastUpdateDt;
|
||||||
|
String lastUpdateRedux = "";
|
||||||
try {
|
try {
|
||||||
|
if (modifiedDate.equals("last_modified")) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if (modifiedDate.length() != 19) {
|
if (modifiedDate.length() != 19) {
|
||||||
modifiedDate = modifiedDate.substring(0, 19);
|
modifiedDate = modifiedDate.substring(0, 19);
|
||||||
}
|
}
|
||||||
|
if (lastUpdate.length() != 19) {
|
||||||
|
lastUpdateRedux = lastUpdate.substring(0, 19);
|
||||||
|
} else {
|
||||||
|
lastUpdateRedux = lastUpdate;
|
||||||
|
}
|
||||||
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
||||||
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdateRedux);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.info("[" + orcidId + "] Parsing date: ", e.getMessage());
|
throw new RuntimeException("[" + orcidId + "] modifiedDate <" + modifiedDate + "> lastUpdate <" + lastUpdate
|
||||||
return true;
|
+ "> Parsing date: " + e.getMessage());
|
||||||
}
|
}
|
||||||
return modifiedDateDt.after(lastUpdateDt);
|
return modifiedDateDt.after(lastUpdateDt);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java (rejected hunks)
|
||||||
|
@@ -31,7 +32,6 @@ public class SparkDownloadOrcidAuthors {
|
||||||
|
|
||||||
|
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
|
||||||
|
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||||
|
- static String lastUpdate;
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@@ -54,14 +54,18 @@ public class SparkDownloadOrcidAuthors {
|
||||||
|
final String token = parser.get("token");
|
||||||
|
final String lambdaFileName = parser.get("lambdaFileName");
|
||||||
|
logger.info("lambdaFileName: {}", lambdaFileName);
|
||||||
|
-
|
||||||
|
- lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
|
||||||
|
+ final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
+ String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||||
|
+ logger.info("lastUpdate: ", lastUpdate);
|
||||||
|
+ if (StringUtils.isBlank(lastUpdate)) {
|
||||||
|
+ throw new RuntimeException("last update info not found");
|
||||||
|
+ }
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
|
|
@ -0,0 +1,251 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.time.format.DateTimeFormatter;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||||
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.HttpGet;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
|
import org.apache.http.impl.client.HttpClients;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||||
|
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||||
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class SparkDownloadOrcidWorks {
|
||||||
|
|
||||||
|
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidWorks.class);
|
||||||
|
public static final String LAMBDA_FILE_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||||
|
public static final DateTimeFormatter LAMBDA_FILE_DATE_FORMATTER = DateTimeFormatter
|
||||||
|
.ofPattern(LAMBDA_FILE_DATE_FORMAT);
|
||||||
|
public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
|
||||||
|
public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
|
||||||
|
.ofPattern(ORCID_XML_DATETIME_FORMAT);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkDownloadOrcidWorks.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
logger.info("workingPath: ", workingPath);
|
||||||
|
final String outputPath = parser.get("outputPath");
|
||||||
|
final String token = parser.get("token");
|
||||||
|
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||||
|
logger.info("lastUpdateValue: ", lastUpdateValue);
|
||||||
|
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
|
||||||
|
LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
|
||||||
|
LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
|
||||||
|
LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
|
||||||
|
LongAccumulator maxModifiedWorksLimitAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("max_modified_works_limit");
|
||||||
|
LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
|
||||||
|
LongAccumulator errorLoadingJsonFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_loading_json_found");
|
||||||
|
LongAccumulator errorLoadingXMLFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_loading_xml_found");
|
||||||
|
LongAccumulator errorParsingXMLFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_parsing_xml_found");
|
||||||
|
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
||||||
|
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
||||||
|
LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
|
||||||
|
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
||||||
|
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
||||||
|
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
|
||||||
|
LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
|
||||||
|
|
||||||
|
JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
|
||||||
|
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
|
||||||
|
updatedAuthorsAcc.setValue(updatedAuthorsRDD.count());
|
||||||
|
|
||||||
|
FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
|
||||||
|
String orcidId = data._1().toString();
|
||||||
|
String jsonData = data._2().toString();
|
||||||
|
List<String> workIds = new ArrayList<>();
|
||||||
|
Map<String, String> workIdLastModifiedDate = new HashMap<>();
|
||||||
|
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||||
|
String statusCode = getJsonValue(jElement, "statusCode");
|
||||||
|
if (statusCode.equals("200")) {
|
||||||
|
String compressedData = getJsonValue(jElement, "compressedData");
|
||||||
|
if (StringUtils.isEmpty(compressedData)) {
|
||||||
|
errorLoadingJsonFoundAcc.add(1);
|
||||||
|
} else {
|
||||||
|
String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
|
||||||
|
if (StringUtils.isEmpty(authorSummary)) {
|
||||||
|
errorLoadingXMLFoundAcc.add(1);
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
workIdLastModifiedDate = XMLRecordParser
|
||||||
|
.retrieveWorkIdLastModifiedDate(authorSummary.getBytes());
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("parsing " + orcidId + " [" + jsonData + "]", e);
|
||||||
|
errorParsingXMLFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
errorCodeFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
parsedAuthorsAcc.add(1);
|
||||||
|
workIdLastModifiedDate.forEach((k, v) -> {
|
||||||
|
parsedWorksAcc.add(1);
|
||||||
|
if (isModified(orcidId, v, lastUpdateValue)) {
|
||||||
|
modifiedWorksAcc.add(1);
|
||||||
|
workIds.add(orcidId.concat("/work/").concat(k));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (workIdLastModifiedDate.size() > 50) {
|
||||||
|
maxModifiedWorksLimitAcc.add(1);
|
||||||
|
}
|
||||||
|
return workIds.iterator();
|
||||||
|
};
|
||||||
|
|
||||||
|
Function<String, Tuple2<String, String>> downloadWorkFunction = data -> {
|
||||||
|
String relativeWorkUrl = data;
|
||||||
|
String orcidId = relativeWorkUrl.split("/")[0];
|
||||||
|
final DownloadedRecordData downloaded = new DownloadedRecordData();
|
||||||
|
downloaded.setOrcidId(orcidId);
|
||||||
|
downloaded.setLastModifiedDate(lastUpdateValue);
|
||||||
|
CloseableHttpClient client = HttpClients.createDefault();
|
||||||
|
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + relativeWorkUrl);
|
||||||
|
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||||
|
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||||
|
long startReq = System.currentTimeMillis();
|
||||||
|
CloseableHttpResponse response = client.execute(httpGet);
|
||||||
|
long endReq = System.currentTimeMillis();
|
||||||
|
long reqTime = endReq - startReq;
|
||||||
|
if (reqTime < 1000) {
|
||||||
|
Thread.sleep(1000 - reqTime);
|
||||||
|
}
|
||||||
|
int statusCode = response.getStatusLine().getStatusCode();
|
||||||
|
downloaded.setStatusCode(statusCode);
|
||||||
|
if (statusCode != 200) {
|
||||||
|
switch (statusCode) {
|
||||||
|
case 403:
|
||||||
|
errorHTTP403Acc.add(1);
|
||||||
|
case 404:
|
||||||
|
errorHTTP404Acc.add(1);
|
||||||
|
case 409:
|
||||||
|
errorHTTP409Acc.add(1);
|
||||||
|
case 503:
|
||||||
|
errorHTTP503Acc.add(1);
|
||||||
|
case 525:
|
||||||
|
errorHTTP525Acc.add(1);
|
||||||
|
default:
|
||||||
|
errorHTTPGenericAcc.add(1);
|
||||||
|
logger
|
||||||
|
.info(
|
||||||
|
"Downloading " + orcidId + " status code: "
|
||||||
|
+ response.getStatusLine().getStatusCode());
|
||||||
|
}
|
||||||
|
return downloaded.toTuple2();
|
||||||
|
}
|
||||||
|
downloadedRecordsAcc.add(1);
|
||||||
|
downloaded
|
||||||
|
.setCompressedData(
|
||||||
|
ArgumentApplicationParser
|
||||||
|
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
||||||
|
client.close();
|
||||||
|
return downloaded.toTuple2();
|
||||||
|
};
|
||||||
|
|
||||||
|
updatedAuthorsRDD
|
||||||
|
.flatMap(retrieveWorkUrlFunction)
|
||||||
|
.repartition(100)
|
||||||
|
.map(downloadWorkFunction)
|
||||||
|
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
|
||||||
|
.saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);
|
||||||
|
|
||||||
|
logger.info("updatedAuthorsAcc: " + updatedAuthorsAcc.value().toString());
|
||||||
|
logger.info("parsedAuthorsAcc: " + parsedAuthorsAcc.value().toString());
|
||||||
|
logger.info("parsedWorksAcc: " + parsedWorksAcc.value().toString());
|
||||||
|
logger.info("modifiedWorksAcc: " + modifiedWorksAcc.value().toString());
|
||||||
|
logger.info("maxModifiedWorksLimitAcc: " + maxModifiedWorksLimitAcc.value().toString());
|
||||||
|
logger.info("errorCodeFoundAcc: " + errorCodeFoundAcc.value().toString());
|
||||||
|
logger.info("errorLoadingJsonFoundAcc: " + errorLoadingJsonFoundAcc.value().toString());
|
||||||
|
logger.info("errorLoadingXMLFoundAcc: " + errorLoadingXMLFoundAcc.value().toString());
|
||||||
|
logger.info("errorParsingXMLFoundAcc: " + errorParsingXMLFoundAcc.value().toString());
|
||||||
|
logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
|
||||||
|
logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
|
||||||
|
logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
|
||||||
|
logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
|
||||||
|
logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
|
||||||
|
logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString());
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isModified(String orcidId, String modifiedDateValue, String lastUpdateValue) {
|
||||||
|
LocalDate modifiedDate = null;
|
||||||
|
LocalDate lastUpdate = null;
|
||||||
|
try {
|
||||||
|
modifiedDate = LocalDate.parse(modifiedDateValue, SparkDownloadOrcidWorks.ORCID_XML_DATETIMEFORMATTER);
|
||||||
|
if (lastUpdateValue.length() != 19) {
|
||||||
|
lastUpdateValue = lastUpdateValue.substring(0, 19);
|
||||||
|
}
|
||||||
|
lastUpdate = LocalDate
|
||||||
|
.parse(lastUpdateValue, SparkDownloadOrcidWorks.LAMBDA_FILE_DATE_FORMATTER);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.info("[" + orcidId + "] Parsing date: ", e.getMessage());
|
||||||
|
throw new RuntimeException("[" + orcidId + "] Parsing date: " + e.getMessage());
|
||||||
|
}
|
||||||
|
return modifiedDate.isAfter(lastUpdate);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getJsonValue(JsonElement jElement, String property) {
|
||||||
|
if (jElement.getAsJsonObject().has(property)) {
|
||||||
|
JsonElement name = null;
|
||||||
|
name = jElement.getAsJsonObject().get(property);
|
||||||
|
if (name != null && !name.isJsonNull()) {
|
||||||
|
return name.getAsString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new String("");
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,9 +3,7 @@ package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.*;
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -17,6 +15,7 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
|
@ -26,6 +25,7 @@ import org.apache.spark.SparkConf;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||||
|
|
||||||
public class SparkGenLastModifiedSeq {
|
public class SparkGenLastModifiedSeq {
|
||||||
private static String hdfsServerUri;
|
private static String hdfsServerUri;
|
||||||
|
@ -50,6 +50,7 @@ public class SparkGenLastModifiedSeq {
|
||||||
outputPath = parser.get("outputPath");
|
outputPath = parser.get("outputPath");
|
||||||
lambdaFileName = parser.get("lambdaFileName");
|
lambdaFileName = parser.get("lambdaFileName");
|
||||||
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
||||||
|
String lastModifiedDateFromLambdaFileUri = "last_modified_date_from_lambda_file.txt";
|
||||||
|
|
||||||
SparkConf sparkConf = new SparkConf();
|
SparkConf sparkConf = new SparkConf();
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
|
@ -57,6 +58,7 @@ public class SparkGenLastModifiedSeq {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
int rowsNum = 0;
|
int rowsNum = 0;
|
||||||
|
String lastModifiedAuthorDate = "";
|
||||||
Path output = new Path(
|
Path output = new Path(
|
||||||
hdfsServerUri
|
hdfsServerUri
|
||||||
.concat(workingPath)
|
.concat(workingPath)
|
||||||
|
@ -89,10 +91,17 @@ public class SparkGenLastModifiedSeq {
|
||||||
final Text value = new Text(recordInfo.get(3));
|
final Text value = new Text(recordInfo.get(3));
|
||||||
writer.append(key, value);
|
writer.append(key, value);
|
||||||
rowsNum++;
|
rowsNum++;
|
||||||
|
if (rowsNum == 2) {
|
||||||
|
lastModifiedAuthorDate = value.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
HDFSUtil
|
||||||
|
.writeToTextFile(
|
||||||
|
hdfsServerUri, workingPath, lastModifiedDateFromLambdaFileUri, lastModifiedAuthorDate);
|
||||||
Log.info("Saved rows from lamda csv tar file: " + rowsNum);
|
Log.info("Saved rows from lamda csv tar file: " + rowsNum);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,15 +4,13 @@ package eu.dnetlib.doiboost.orcid;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.*;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
@ -25,13 +23,15 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.esotericsoftware.minlog.Log;
|
import com.esotericsoftware.minlog.Log;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import com.google.gson.JsonElement;
|
import com.google.gson.JsonElement;
|
||||||
import com.google.gson.JsonParser;
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.OrcidDOI;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class SparkGenerateDoiAuthorList {
|
public class SparkGenerateDoiAuthorList {
|
||||||
|
@ -56,6 +56,10 @@ public class SparkGenerateDoiAuthorList {
|
||||||
logger.info("workingPath: ", workingPath);
|
logger.info("workingPath: ", workingPath);
|
||||||
final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
|
final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
|
||||||
logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath);
|
logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath);
|
||||||
|
final String authorsPath = parser.get("authorsPath");
|
||||||
|
logger.info("authorsPath: ", authorsPath);
|
||||||
|
final String xmlWorksPath = parser.get("xmlWorksPath");
|
||||||
|
logger.info("xmlWorksPath: ", xmlWorksPath);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
|
@ -65,17 +69,21 @@ public class SparkGenerateDoiAuthorList {
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||||
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
.sequenceFile(workingPath.concat(authorsPath), Text.class, Text.class);
|
||||||
Dataset<AuthorData> summariesDataset = spark
|
Dataset<AuthorData> summariesDataset = spark
|
||||||
.createDataset(
|
.createDataset(
|
||||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||||
Encoders.bean(AuthorData.class));
|
Encoders.bean(AuthorData.class));
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> activitiesRDD = sc
|
JavaPairRDD<Text, Text> xmlWorksRDD = sc
|
||||||
.sequenceFile(workingPath + "/output/*.seq", Text.class, Text.class);
|
.sequenceFile(workingPath.concat(xmlWorksPath), Text.class, Text.class);
|
||||||
|
|
||||||
Dataset<WorkData> activitiesDataset = spark
|
Dataset<WorkData> activitiesDataset = spark
|
||||||
.createDataset(
|
.createDataset(
|
||||||
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
|
xmlWorksRDD
|
||||||
|
.map(seq -> XMLRecordParser.VTDParseWorkData(seq._2().toString().getBytes()))
|
||||||
|
.filter(work -> work != null && work.getErrorCode() == null && work.isDoiFound())
|
||||||
|
.rdd(),
|
||||||
Encoders.bean(WorkData.class));
|
Encoders.bean(WorkData.class));
|
||||||
|
|
||||||
Function<Tuple2<String, AuthorData>, Tuple2<String, List<AuthorData>>> toAuthorListFunction = data -> {
|
Function<Tuple2<String, AuthorData>, Tuple2<String, List<AuthorData>>> toAuthorListFunction = data -> {
|
||||||
|
@ -135,13 +143,19 @@ public class SparkGenerateDoiAuthorList {
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
})
|
})
|
||||||
.mapToPair(
|
.mapToPair(s -> {
|
||||||
s -> {
|
List<AuthorData> authorList = s._2();
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
Set<String> oidsAlreadySeen = new HashSet<>();
|
||||||
return new Tuple2<>(s._1(), mapper.writeValueAsString(s._2()));
|
authorList.removeIf(a -> !oidsAlreadySeen.add(a.getOid()));
|
||||||
|
return new Tuple2<>(s._1(), authorList);
|
||||||
})
|
})
|
||||||
.repartition(10)
|
.map(s -> {
|
||||||
.saveAsTextFile(workingPath + outputDoiAuthorListPath);
|
OrcidDOI orcidDOI = new OrcidDOI();
|
||||||
|
orcidDOI.setDoi(s._1());
|
||||||
|
orcidDOI.setAuthors(s._2());
|
||||||
|
return JsonWriter.create(orcidDOI);
|
||||||
|
})
|
||||||
|
.saveAsTextFile(workingPath + outputDoiAuthorListPath, GzipCodec.class);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,242 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
import static org.apache.spark.sql.functions.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||||
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class SparkUpdateOrcidAuthors {
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
|
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidAuthors.class);
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkUpdateOrcidAuthors.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
// final String outputPath = parser.get("outputPath");
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
LongAccumulator oldAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("old_authors_found");
|
||||||
|
LongAccumulator updatedAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("updated_authors_found");
|
||||||
|
LongAccumulator newAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("new_authors_found");
|
||||||
|
LongAccumulator errorCodeAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_code_authors_found");
|
||||||
|
LongAccumulator errorLoadingAuthorsJsonFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_loading_authors_json_found");
|
||||||
|
LongAccumulator errorParsingAuthorsXMLFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_parsing_authors_xml_found");
|
||||||
|
|
||||||
|
Function<Tuple2<Text, Text>, AuthorSummary> retrieveAuthorSummaryFunction = data -> {
|
||||||
|
AuthorSummary authorSummary = new AuthorSummary();
|
||||||
|
String orcidId = data._1().toString();
|
||||||
|
String jsonData = data._2().toString();
|
||||||
|
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||||
|
String statusCode = getJsonValue(jElement, "statusCode");
|
||||||
|
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||||
|
if (statusCode.equals("200")) {
|
||||||
|
String compressedData = getJsonValue(jElement, "compressedData");
|
||||||
|
if (StringUtils.isEmpty(compressedData)) {
|
||||||
|
errorLoadingAuthorsJsonFoundAcc.add(1);
|
||||||
|
} else {
|
||||||
|
String xmlAuthor = ArgumentApplicationParser.decompressValue(compressedData);
|
||||||
|
try {
|
||||||
|
authorSummary = XMLRecordParser
|
||||||
|
.VTDParseAuthorSummary(xmlAuthor.getBytes());
|
||||||
|
authorSummary.setStatusCode(statusCode);
|
||||||
|
authorSummary.setDownloadDate(Long.toString(System.currentTimeMillis()));
|
||||||
|
authorSummary.setBase64CompressData(compressedData);
|
||||||
|
return authorSummary;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("parsing xml " + orcidId + " [" + jsonData + "]", e);
|
||||||
|
errorParsingAuthorsXMLFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
authorSummary.setStatusCode(statusCode);
|
||||||
|
authorSummary.setDownloadDate(Long.toString(System.currentTimeMillis()));
|
||||||
|
errorCodeAuthorsFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
return authorSummary;
|
||||||
|
};
|
||||||
|
|
||||||
|
Dataset<AuthorSummary> downloadedAuthorSummaryDS = spark
|
||||||
|
.createDataset(
|
||||||
|
sc
|
||||||
|
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class)
|
||||||
|
.map(retrieveAuthorSummaryFunction)
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(AuthorSummary.class));
|
||||||
|
Dataset<AuthorSummary> currentAuthorSummaryDS = spark
|
||||||
|
.createDataset(
|
||||||
|
sc
|
||||||
|
.textFile(workingPath.concat("orcid_dataset/authors/*"))
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(AuthorSummary.class));
|
||||||
|
Dataset<AuthorSummary> mergedAuthorSummaryDS = currentAuthorSummaryDS
|
||||||
|
.joinWith(
|
||||||
|
downloadedAuthorSummaryDS,
|
||||||
|
currentAuthorSummaryDS
|
||||||
|
.col("authorData.oid")
|
||||||
|
.equalTo(downloadedAuthorSummaryDS.col("authorData.oid")),
|
||||||
|
"full_outer")
|
||||||
|
.map(value -> {
|
||||||
|
Optional<AuthorSummary> opCurrent = Optional.ofNullable(value._1());
|
||||||
|
Optional<AuthorSummary> opDownloaded = Optional.ofNullable(value._2());
|
||||||
|
if (!opCurrent.isPresent()) {
|
||||||
|
newAuthorsFoundAcc.add(1);
|
||||||
|
return opDownloaded.get();
|
||||||
|
}
|
||||||
|
if (!opDownloaded.isPresent()) {
|
||||||
|
oldAuthorsFoundAcc.add(1);
|
||||||
|
return opCurrent.get();
|
||||||
|
}
|
||||||
|
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||||
|
updatedAuthorsFoundAcc.add(1);
|
||||||
|
return opDownloaded.get();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
},
|
||||||
|
Encoders.bean(AuthorSummary.class))
|
||||||
|
.filter(Objects::nonNull);
|
||||||
|
|
||||||
|
long mergedCount = mergedAuthorSummaryDS.count();
|
||||||
|
|
||||||
|
Dataset<AuthorSummary> base64DedupedDS = mergedAuthorSummaryDS.dropDuplicates("base64CompressData");
|
||||||
|
|
||||||
|
List<String> dupOids = base64DedupedDS
|
||||||
|
.groupBy("authorData.oid")
|
||||||
|
.agg(count("authorData.oid").alias("oidOccurrenceCount"))
|
||||||
|
.where("oidOccurrenceCount > 1")
|
||||||
|
.select("oid")
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(row -> row.get(0).toString())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
JavaRDD<AuthorSummary> dupAuthors = base64DedupedDS
|
||||||
|
.toJavaRDD()
|
||||||
|
.filter(
|
||||||
|
authorSummary -> (Objects.nonNull(authorSummary.getAuthorData())
|
||||||
|
&& Objects.nonNull(authorSummary.getAuthorData().getOid())))
|
||||||
|
.filter(authorSummary -> dupOids.contains(authorSummary.getAuthorData().getOid()));
|
||||||
|
|
||||||
|
Dataset<AuthorSummary> dupAuthorSummaryDS = spark
|
||||||
|
.createDataset(
|
||||||
|
dupAuthors.rdd(),
|
||||||
|
Encoders.bean(AuthorSummary.class));
|
||||||
|
List<Tuple2<String, String>> lastModifiedAuthors = dupAuthorSummaryDS
|
||||||
|
.groupBy("authorData.oid")
|
||||||
|
.agg(array_max(collect_list("downloadDate")))
|
||||||
|
.map(
|
||||||
|
row -> new Tuple2<>(row.get(0).toString(), row.get(1).toString()),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||||
|
.toJavaRDD()
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
JavaRDD<AuthorSummary> lastDownloadedAuthors = base64DedupedDS
|
||||||
|
.toJavaRDD()
|
||||||
|
.filter(
|
||||||
|
authorSummary -> (Objects.nonNull(authorSummary.getAuthorData())
|
||||||
|
&& Objects.nonNull(authorSummary.getAuthorData().getOid())))
|
||||||
|
.filter(authorSummary -> {
|
||||||
|
boolean oidFound = lastModifiedAuthors
|
||||||
|
.stream()
|
||||||
|
.filter(a -> a._1().equals(authorSummary.getAuthorData().getOid()))
|
||||||
|
.count() == 1;
|
||||||
|
boolean tsFound = lastModifiedAuthors
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
a -> a._1().equals(authorSummary.getAuthorData().getOid()) &&
|
||||||
|
a._2().equals(authorSummary.getDownloadDate()))
|
||||||
|
.count() == 1;
|
||||||
|
return (oidFound && tsFound) || (!oidFound);
|
||||||
|
});
|
||||||
|
|
||||||
|
Dataset<AuthorSummary> cleanedDS = spark
|
||||||
|
.createDataset(
|
||||||
|
lastDownloadedAuthors.rdd(),
|
||||||
|
Encoders.bean(AuthorSummary.class))
|
||||||
|
.dropDuplicates("downloadDate", "authorData");
|
||||||
|
cleanedDS
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(authorSummary -> OBJECT_MAPPER.writeValueAsString(authorSummary))
|
||||||
|
.saveAsTextFile(workingPath.concat("orcid_dataset/new_authors"), GzipCodec.class);
|
||||||
|
long cleanedDSCount = cleanedDS.count();
|
||||||
|
|
||||||
|
logger.info("report_oldAuthorsFoundAcc: " + oldAuthorsFoundAcc.value().toString());
|
||||||
|
logger.info("report_newAuthorsFoundAcc: " + newAuthorsFoundAcc.value().toString());
|
||||||
|
logger.info("report_updatedAuthorsFoundAcc: " + updatedAuthorsFoundAcc.value().toString());
|
||||||
|
logger.info("report_errorCodeFoundAcc: " + errorCodeAuthorsFoundAcc.value().toString());
|
||||||
|
logger.info("report_errorLoadingJsonFoundAcc: " + errorLoadingAuthorsJsonFoundAcc.value().toString());
|
||||||
|
logger.info("report_errorParsingXMLFoundAcc: " + errorParsingAuthorsXMLFoundAcc.value().toString());
|
||||||
|
logger.info("report_merged_count: " + mergedCount);
|
||||||
|
logger.info("report_cleaned_count: " + cleanedDSCount);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getJsonValue(JsonElement jElement, String property) {
|
||||||
|
if (jElement.getAsJsonObject().has(property)) {
|
||||||
|
JsonElement name = null;
|
||||||
|
name = jElement.getAsJsonObject().get(property);
|
||||||
|
if (name != null && !name.isJsonNull()) {
|
||||||
|
return name.getAsString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,317 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.Work;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class SparkUpdateOrcidDatasets {
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
|
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidDatasets.class);
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkUpdateOrcidDatasets.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
// final String outputPath = parser.get("outputPath");
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
LongAccumulator oldAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("old_authors_found");
|
||||||
|
LongAccumulator updatedAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("updated_authors_found");
|
||||||
|
LongAccumulator newAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("new_authors_found");
|
||||||
|
LongAccumulator errorCodeAuthorsFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_code_authors_found");
|
||||||
|
LongAccumulator errorLoadingAuthorsJsonFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_loading_authors_json_found");
|
||||||
|
LongAccumulator errorParsingAuthorsXMLFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_parsing_authors_xml_found");
|
||||||
|
|
||||||
|
LongAccumulator oldWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("old_works_found");
|
||||||
|
LongAccumulator updatedWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("updated_works_found");
|
||||||
|
LongAccumulator newWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("new_works_found");
|
||||||
|
LongAccumulator errorCodeWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_code_works_found");
|
||||||
|
LongAccumulator errorLoadingWorksJsonFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_loading_works_json_found");
|
||||||
|
LongAccumulator errorParsingWorksXMLFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_parsing_works_xml_found");
|
||||||
|
|
||||||
|
// JavaPairRDD<Text, Text> xmlSummariesRDD = sc
|
||||||
|
// .sequenceFile(workingPath.concat("xml/authors/xml_authors.seq"), Text.class, Text.class);
|
||||||
|
// xmlSummariesRDD
|
||||||
|
// .map(seq -> {
|
||||||
|
// AuthorSummary authorSummary = XMLRecordParser
|
||||||
|
// .VTDParseAuthorSummary(seq._2().toString().getBytes());
|
||||||
|
// authorSummary
|
||||||
|
// .setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString()));
|
||||||
|
// return authorSummary;
|
||||||
|
// })
|
||||||
|
// .filter(authorSummary -> authorSummary != null)
|
||||||
|
// .map(authorSummary -> JsonWriter.create(authorSummary))
|
||||||
|
// .saveAsTextFile(workingPath.concat("orcid_dataset/authors"), GzipCodec.class);
|
||||||
|
//
|
||||||
|
// JavaPairRDD<Text, Text> xmlWorksRDD = sc
|
||||||
|
// .sequenceFile(workingPath.concat("xml/works/*"), Text.class, Text.class);
|
||||||
|
//
|
||||||
|
// xmlWorksRDD
|
||||||
|
// .map(seq -> {
|
||||||
|
// WorkDetail workDetail = XMLRecordParserNoDoi.VTDParseWorkData(seq._2().toString().getBytes());
|
||||||
|
// Work work = new Work();
|
||||||
|
// work.setWorkDetail(workDetail);
|
||||||
|
// work.setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString()));
|
||||||
|
// return work;
|
||||||
|
// })
|
||||||
|
// .filter(work -> work != null)
|
||||||
|
// .map(work -> JsonWriter.create(work))
|
||||||
|
// .saveAsTextFile(workingPath.concat("orcid_dataset/works"), GzipCodec.class);
|
||||||
|
|
||||||
|
// Function<Tuple2<Text, Text>, AuthorSummary> retrieveAuthorSummaryFunction = data -> {
|
||||||
|
// AuthorSummary authorSummary = new AuthorSummary();
|
||||||
|
// String orcidId = data._1().toString();
|
||||||
|
// String jsonData = data._2().toString();
|
||||||
|
// JsonElement jElement = new JsonParser().parse(jsonData);
|
||||||
|
// String statusCode = getJsonValue(jElement, "statusCode");
|
||||||
|
// String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||||
|
// if (statusCode.equals("200")) {
|
||||||
|
// String compressedData = getJsonValue(jElement, "compressedData");
|
||||||
|
// if (StringUtils.isEmpty(compressedData)) {
|
||||||
|
// errorLoadingAuthorsJsonFoundAcc.add(1);
|
||||||
|
// } else {
|
||||||
|
// String xmlAuthor = ArgumentApplicationParser.decompressValue(compressedData);
|
||||||
|
// try {
|
||||||
|
// authorSummary = XMLRecordParser
|
||||||
|
// .VTDParseAuthorSummary(xmlAuthor.getBytes());
|
||||||
|
// authorSummary.setStatusCode(statusCode);
|
||||||
|
// authorSummary.setDownloadDate("2020-11-18 00:00:05.644768");
|
||||||
|
// authorSummary.setBase64CompressData(compressedData);
|
||||||
|
// return authorSummary;
|
||||||
|
// } catch (Exception e) {
|
||||||
|
// logger.error("parsing xml " + orcidId + " [" + jsonData + "]", e);
|
||||||
|
// errorParsingAuthorsXMLFoundAcc.add(1);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// } else {
|
||||||
|
// authorSummary.setStatusCode(statusCode);
|
||||||
|
// authorSummary.setDownloadDate("2020-11-18 00:00:05.644768");
|
||||||
|
// errorCodeAuthorsFoundAcc.add(1);
|
||||||
|
// }
|
||||||
|
// return authorSummary;
|
||||||
|
// };
|
||||||
|
//
|
||||||
|
// Dataset<AuthorSummary> downloadedAuthorSummaryDS = spark
|
||||||
|
// .createDataset(
|
||||||
|
// sc
|
||||||
|
// .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class)
|
||||||
|
// .map(retrieveAuthorSummaryFunction)
|
||||||
|
// .rdd(),
|
||||||
|
// Encoders.bean(AuthorSummary.class));
|
||||||
|
// Dataset<AuthorSummary> currentAuthorSummaryDS = spark
|
||||||
|
// .createDataset(
|
||||||
|
// sc
|
||||||
|
// .textFile(workingPath.concat("orcid_dataset/authors/*"))
|
||||||
|
// .map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
|
||||||
|
// .rdd(),
|
||||||
|
// Encoders.bean(AuthorSummary.class));
|
||||||
|
// currentAuthorSummaryDS
|
||||||
|
// .joinWith(
|
||||||
|
// downloadedAuthorSummaryDS,
|
||||||
|
// currentAuthorSummaryDS
|
||||||
|
// .col("authorData.oid")
|
||||||
|
// .equalTo(downloadedAuthorSummaryDS.col("authorData.oid")),
|
||||||
|
// "full_outer")
|
||||||
|
// .map(value -> {
|
||||||
|
// Optional<AuthorSummary> opCurrent = Optional.ofNullable(value._1());
|
||||||
|
// Optional<AuthorSummary> opDownloaded = Optional.ofNullable(value._2());
|
||||||
|
// if (!opCurrent.isPresent()) {
|
||||||
|
// newAuthorsFoundAcc.add(1);
|
||||||
|
// return opDownloaded.get();
|
||||||
|
// }
|
||||||
|
// if (!opDownloaded.isPresent()) {
|
||||||
|
// oldAuthorsFoundAcc.add(1);
|
||||||
|
// return opCurrent.get();
|
||||||
|
// }
|
||||||
|
// if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||||
|
// updatedAuthorsFoundAcc.add(1);
|
||||||
|
// return opDownloaded.get();
|
||||||
|
// }
|
||||||
|
// return null;
|
||||||
|
// },
|
||||||
|
// Encoders.bean(AuthorSummary.class))
|
||||||
|
// .filter(Objects::nonNull)
|
||||||
|
// .toJavaRDD()
|
||||||
|
// .map(authorSummary -> OBJECT_MAPPER.writeValueAsString(authorSummary))
|
||||||
|
// .saveAsTextFile(workingPath.concat("orcid_dataset/new_authors"), GzipCodec.class);
|
||||||
|
//
|
||||||
|
// logger.info("oldAuthorsFoundAcc: " + oldAuthorsFoundAcc.value().toString());
|
||||||
|
// logger.info("newAuthorsFoundAcc: " + newAuthorsFoundAcc.value().toString());
|
||||||
|
// logger.info("updatedAuthorsFoundAcc: " + updatedAuthorsFoundAcc.value().toString());
|
||||||
|
// logger.info("errorCodeFoundAcc: " + errorCodeAuthorsFoundAcc.value().toString());
|
||||||
|
// logger.info("errorLoadingJsonFoundAcc: " + errorLoadingAuthorsJsonFoundAcc.value().toString());
|
||||||
|
// logger.info("errorParsingXMLFoundAcc: " + errorParsingAuthorsXMLFoundAcc.value().toString());
|
||||||
|
|
||||||
|
Function<String, Work> retrieveWorkFunction = jsonData -> {
|
||||||
|
Work work = new Work();
|
||||||
|
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||||
|
String statusCode = getJsonValue(jElement, "statusCode");
|
||||||
|
work.setStatusCode(statusCode);
|
||||||
|
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||||
|
work.setDownloadDate("2020-11-18 00:00:05.644768");
|
||||||
|
if (statusCode.equals("200")) {
|
||||||
|
String compressedData = getJsonValue(jElement, "compressedData");
|
||||||
|
if (StringUtils.isEmpty(compressedData)) {
|
||||||
|
errorLoadingWorksJsonFoundAcc.add(1);
|
||||||
|
} else {
|
||||||
|
String xmlWork = ArgumentApplicationParser.decompressValue(compressedData);
|
||||||
|
try {
|
||||||
|
WorkDetail workDetail = XMLRecordParserNoDoi
|
||||||
|
.VTDParseWorkData(xmlWork.getBytes());
|
||||||
|
work.setWorkDetail(workDetail);
|
||||||
|
work.setBase64CompressData(compressedData);
|
||||||
|
return work;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("parsing xml [" + jsonData + "]", e);
|
||||||
|
errorParsingWorksXMLFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
errorCodeWorksFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
return work;
|
||||||
|
};
|
||||||
|
|
||||||
|
Dataset<Work> downloadedWorksDS = spark
|
||||||
|
.createDataset(
|
||||||
|
sc
|
||||||
|
.textFile(workingPath + "downloads/updated_works/*")
|
||||||
|
.map(s -> {
|
||||||
|
return s.substring(21, s.length() - 1);
|
||||||
|
})
|
||||||
|
.map(retrieveWorkFunction)
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(Work.class));
|
||||||
|
Dataset<Work> currentWorksDS = spark
|
||||||
|
.createDataset(
|
||||||
|
sc
|
||||||
|
.textFile(workingPath.concat("orcid_dataset/works/*"))
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(Work.class));
|
||||||
|
currentWorksDS
|
||||||
|
.joinWith(
|
||||||
|
downloadedWorksDS,
|
||||||
|
currentWorksDS
|
||||||
|
.col("workDetail.id")
|
||||||
|
.equalTo(downloadedWorksDS.col("workDetail.id"))
|
||||||
|
.and(
|
||||||
|
currentWorksDS
|
||||||
|
.col("workDetail.oid")
|
||||||
|
.equalTo(downloadedWorksDS.col("workDetail.oid"))),
|
||||||
|
"full_outer")
|
||||||
|
.map(value -> {
|
||||||
|
Optional<Work> opCurrent = Optional.ofNullable(value._1());
|
||||||
|
Optional<Work> opDownloaded = Optional.ofNullable(value._2());
|
||||||
|
if (!opCurrent.isPresent()) {
|
||||||
|
newWorksFoundAcc.add(1);
|
||||||
|
return opDownloaded.get();
|
||||||
|
}
|
||||||
|
if (!opDownloaded.isPresent()) {
|
||||||
|
oldWorksFoundAcc.add(1);
|
||||||
|
return opCurrent.get();
|
||||||
|
}
|
||||||
|
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||||
|
updatedWorksFoundAcc.add(1);
|
||||||
|
return opDownloaded.get();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
},
|
||||||
|
Encoders.bean(Work.class))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(work -> OBJECT_MAPPER.writeValueAsString(work))
|
||||||
|
.saveAsTextFile(workingPath.concat("orcid_dataset/new_works"), GzipCodec.class);
|
||||||
|
|
||||||
|
logger.info("oldWorksFoundAcc: " + oldWorksFoundAcc.value().toString());
|
||||||
|
logger.info("newWorksFoundAcc: " + newWorksFoundAcc.value().toString());
|
||||||
|
logger.info("updatedWorksFoundAcc: " + updatedWorksFoundAcc.value().toString());
|
||||||
|
logger.info("errorCodeWorksFoundAcc: " + errorCodeWorksFoundAcc.value().toString());
|
||||||
|
logger.info("errorLoadingJsonWorksFoundAcc: " + errorLoadingWorksJsonFoundAcc.value().toString());
|
||||||
|
logger.info("errorParsingXMLWorksFoundAcc: " + errorParsingWorksXMLFoundAcc.value().toString());
|
||||||
|
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getJsonValue(JsonElement jElement, String property) {
|
||||||
|
if (jElement.getAsJsonObject().has(property)) {
|
||||||
|
JsonElement name = null;
|
||||||
|
name = jElement.getAsJsonObject().get(property);
|
||||||
|
if (name != null && !name.isJsonNull()) {
|
||||||
|
return name.getAsString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,186 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import com.google.gson.JsonParser;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.Work;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
|
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
|
||||||
|
public class SparkUpdateOrcidWorks {
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
|
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidWorks.class);
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkUpdateOrcidWorks.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
LongAccumulator oldWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("old_works_found");
|
||||||
|
LongAccumulator updatedWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("updated_works_found");
|
||||||
|
LongAccumulator newWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("new_works_found");
|
||||||
|
LongAccumulator errorCodeWorksFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_code_works_found");
|
||||||
|
LongAccumulator errorLoadingWorksJsonFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_loading_works_json_found");
|
||||||
|
LongAccumulator errorParsingWorksXMLFoundAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("error_parsing_works_xml_found");
|
||||||
|
|
||||||
|
Function<String, Work> retrieveWorkFunction = jsonData -> {
|
||||||
|
Work work = new Work();
|
||||||
|
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||||
|
String statusCode = getJsonValue(jElement, "statusCode");
|
||||||
|
work.setStatusCode(statusCode);
|
||||||
|
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||||
|
work.setDownloadDate(Long.toString(System.currentTimeMillis()));
|
||||||
|
if (statusCode.equals("200")) {
|
||||||
|
String compressedData = getJsonValue(jElement, "compressedData");
|
||||||
|
if (StringUtils.isEmpty(compressedData)) {
|
||||||
|
errorLoadingWorksJsonFoundAcc.add(1);
|
||||||
|
} else {
|
||||||
|
String xmlWork = ArgumentApplicationParser.decompressValue(compressedData);
|
||||||
|
try {
|
||||||
|
WorkDetail workDetail = XMLRecordParserNoDoi
|
||||||
|
.VTDParseWorkData(xmlWork.getBytes());
|
||||||
|
work.setWorkDetail(workDetail);
|
||||||
|
work.setBase64CompressData(compressedData);
|
||||||
|
return work;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("parsing xml [" + jsonData + "]", e);
|
||||||
|
errorParsingWorksXMLFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
errorCodeWorksFoundAcc.add(1);
|
||||||
|
}
|
||||||
|
return work;
|
||||||
|
};
|
||||||
|
|
||||||
|
Dataset<Work> downloadedWorksDS = spark
|
||||||
|
.createDataset(
|
||||||
|
sc
|
||||||
|
.textFile(workingPath + "downloads/updated_works/*")
|
||||||
|
.map(s -> {
|
||||||
|
return s.substring(21, s.length() - 1);
|
||||||
|
})
|
||||||
|
.map(retrieveWorkFunction)
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(Work.class));
|
||||||
|
Dataset<Work> currentWorksDS = spark
|
||||||
|
.createDataset(
|
||||||
|
sc
|
||||||
|
.textFile(workingPath.concat("orcid_dataset/works/*"))
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(Work.class));
|
||||||
|
currentWorksDS
|
||||||
|
.joinWith(
|
||||||
|
downloadedWorksDS,
|
||||||
|
currentWorksDS
|
||||||
|
.col("workDetail.id")
|
||||||
|
.equalTo(downloadedWorksDS.col("workDetail.id"))
|
||||||
|
.and(
|
||||||
|
currentWorksDS
|
||||||
|
.col("workDetail.oid")
|
||||||
|
.equalTo(downloadedWorksDS.col("workDetail.oid"))),
|
||||||
|
"full_outer")
|
||||||
|
.map(value -> {
|
||||||
|
Optional<Work> opCurrent = Optional.ofNullable(value._1());
|
||||||
|
Optional<Work> opDownloaded = Optional.ofNullable(value._2());
|
||||||
|
if (!opCurrent.isPresent()) {
|
||||||
|
newWorksFoundAcc.add(1);
|
||||||
|
return opDownloaded.get();
|
||||||
|
}
|
||||||
|
if (!opDownloaded.isPresent()) {
|
||||||
|
oldWorksFoundAcc.add(1);
|
||||||
|
return opCurrent.get();
|
||||||
|
}
|
||||||
|
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||||
|
updatedWorksFoundAcc.add(1);
|
||||||
|
return opDownloaded.get();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
},
|
||||||
|
Encoders.bean(Work.class))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(work -> OBJECT_MAPPER.writeValueAsString(work))
|
||||||
|
.saveAsTextFile(workingPath.concat("orcid_dataset/new_works"), GzipCodec.class);
|
||||||
|
|
||||||
|
logger.info("oldWorksFoundAcc: " + oldWorksFoundAcc.value().toString());
|
||||||
|
logger.info("newWorksFoundAcc: " + newWorksFoundAcc.value().toString());
|
||||||
|
logger.info("updatedWorksFoundAcc: " + updatedWorksFoundAcc.value().toString());
|
||||||
|
logger.info("errorCodeWorksFoundAcc: " + errorCodeWorksFoundAcc.value().toString());
|
||||||
|
logger.info("errorLoadingJsonWorksFoundAcc: " + errorLoadingWorksJsonFoundAcc.value().toString());
|
||||||
|
logger.info("errorParsingXMLWorksFoundAcc: " + errorParsingWorksXMLFoundAcc.value().toString());
|
||||||
|
|
||||||
|
String lastModifiedDateFromLambdaFile = HDFSUtil
|
||||||
|
.readFromTextFile(hdfsServerUri, workingPath, "last_modified_date_from_lambda_file.txt");
|
||||||
|
HDFSUtil.writeToTextFile(hdfsServerUri, workingPath, "last_update.txt", lastModifiedDateFromLambdaFile);
|
||||||
|
logger.info("last_update file updated");
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getJsonValue(JsonElement jElement, String property) {
|
||||||
|
if (jElement.getAsJsonObject().has(property)) {
|
||||||
|
JsonElement name = null;
|
||||||
|
name = jElement.getAsJsonObject().get(property);
|
||||||
|
if (name != null && !name.isJsonNull()) {
|
||||||
|
return name.getAsString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,11 +3,11 @@ package eu.dnetlib.doiboost.orcid.json;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
|
|
||||||
public class JsonHelper {
|
public class JsonHelper {
|
||||||
|
|
||||||
public static String createOidWork(WorkDataNoDoi workData) {
|
public static String createOidWork(WorkDetail workData) {
|
||||||
return new Gson().toJson(workData);
|
return new Gson().toJson(workData);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid.util;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
|
import eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors;
|
||||||
|
|
||||||
|
public class HDFSUtil {
|
||||||
|
|
||||||
|
static Logger logger = LoggerFactory.getLogger(HDFSUtil.class);
|
||||||
|
|
||||||
|
private static FileSystem getFileSystem(String hdfsServerUri) throws IOException {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
conf.set("fs.defaultFS", hdfsServerUri);
|
||||||
|
FileSystem fileSystem = FileSystem.get(conf);
|
||||||
|
return fileSystem;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String readFromTextFile(String hdfsServerUri, String workingPath, String path) throws IOException {
|
||||||
|
FileSystem fileSystem = getFileSystem(hdfsServerUri);
|
||||||
|
Path toReadPath = new Path(workingPath.concat(path));
|
||||||
|
if (!fileSystem.exists(toReadPath)) {
|
||||||
|
throw new RuntimeException("File not exist: " + path);
|
||||||
|
}
|
||||||
|
logger.info("Last_update_path " + toReadPath.toString());
|
||||||
|
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(toReadPath));
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
try {
|
||||||
|
String line;
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
sb.append(line);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
br.close();
|
||||||
|
}
|
||||||
|
String buffer = sb.toString();
|
||||||
|
logger.info("Last_update: " + buffer);
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void writeToTextFile(String hdfsServerUri, String workingPath, String path, String text)
|
||||||
|
throws IOException {
|
||||||
|
FileSystem fileSystem = getFileSystem(hdfsServerUri);
|
||||||
|
Path toWritePath = new Path(workingPath.concat(path));
|
||||||
|
if (fileSystem.exists(toWritePath)) {
|
||||||
|
fileSystem.delete(toWritePath, true);
|
||||||
|
}
|
||||||
|
FSDataOutputStream os = fileSystem.create(toWritePath);
|
||||||
|
BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));
|
||||||
|
br.write(text);
|
||||||
|
br.close();
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,22 +1,19 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcid.xml;
|
package eu.dnetlib.doiboost.orcid.xml;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import com.ximpleware.AutoPilot;
|
import com.ximpleware.*;
|
||||||
import com.ximpleware.EOFException;
|
|
||||||
import com.ximpleware.EncodingException;
|
|
||||||
import com.ximpleware.EntityException;
|
|
||||||
import com.ximpleware.ParseException;
|
|
||||||
import com.ximpleware.VTDGen;
|
|
||||||
import com.ximpleware.VTDNav;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorHistory;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
|
|
||||||
public class XMLRecordParser {
|
public class XMLRecordParser {
|
||||||
|
@ -32,9 +29,12 @@ public class XMLRecordParser {
|
||||||
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
|
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
|
||||||
private static final String NS_RECORD = "record";
|
private static final String NS_RECORD = "record";
|
||||||
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
|
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
|
||||||
|
private static final String NS_ACTIVITIES = "activities";
|
||||||
|
private static final String NS_ACTIVITIES_URL = "http://www.orcid.org/ns/activities";
|
||||||
private static final String NS_WORK = "work";
|
private static final String NS_WORK = "work";
|
||||||
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
|
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
|
||||||
|
private static final String NS_HISTORY = "history";
|
||||||
|
private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
|
||||||
|
|
||||||
private static final String NS_ERROR = "error";
|
private static final String NS_ERROR = "error";
|
||||||
|
|
||||||
|
@ -51,6 +51,7 @@ public class XMLRecordParser {
|
||||||
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
|
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
|
||||||
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
|
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
|
||||||
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_HISTORY, NS_HISTORY_URL);
|
||||||
|
|
||||||
AuthorData authorData = new AuthorData();
|
AuthorData authorData = new AuthorData();
|
||||||
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||||
|
@ -89,6 +90,46 @@ public class XMLRecordParser {
|
||||||
authorData.setOtherNames(otherNames);
|
authorData.setOtherNames(otherNames);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// final String creationMethod = VtdUtilityParser.getSingleValue(ap, vn, "//history:creation-method");
|
||||||
|
// if (StringUtils.isNoneBlank(creationMethod)) {
|
||||||
|
// authorData.setCreationMethod(creationMethod);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String completionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:completion-date");
|
||||||
|
// if (StringUtils.isNoneBlank(completionDate)) {
|
||||||
|
// authorData.setCompletionDate(completionDate);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String submissionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:submission-date");
|
||||||
|
// if (StringUtils.isNoneBlank(submissionDate)) {
|
||||||
|
// authorData.setSubmissionDate(submissionDate);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String claimed = VtdUtilityParser.getSingleValue(ap, vn, "//history:claimed");
|
||||||
|
// if (StringUtils.isNoneBlank(claimed)) {
|
||||||
|
// authorData.setClaimed(Boolean.parseBoolean(claimed));
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String verifiedEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-email");
|
||||||
|
// if (StringUtils.isNoneBlank(verifiedEmail)) {
|
||||||
|
// authorData.setVerifiedEmail(Boolean.parseBoolean(verifiedEmail));
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String verifiedPrimaryEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-primary-email");
|
||||||
|
// if (StringUtils.isNoneBlank(verifiedPrimaryEmail)) {
|
||||||
|
// authorData.setVerifiedPrimaryEmail(Boolean.parseBoolean(verifiedPrimaryEmail));
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String deactivationDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:deactivation-date");
|
||||||
|
// if (StringUtils.isNoneBlank(deactivationDate)) {
|
||||||
|
// authorData.setDeactivationDate(deactivationDate);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// final String lastModifiedDate = VtdUtilityParser
|
||||||
|
// .getSingleValue(ap, vn, "//history:history/common:last-modified-date");
|
||||||
|
// if (StringUtils.isNoneBlank(lastModifiedDate)) {
|
||||||
|
// authorData.setLastModifiedDate(lastModifiedDate);
|
||||||
|
// }
|
||||||
return authorData;
|
return authorData;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -139,6 +180,12 @@ public class XMLRecordParser {
|
||||||
return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
|
return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String retrieveWorkIdFromSummary(byte[] bytes, String defaultValue)
|
||||||
|
throws VtdException, ParseException {
|
||||||
|
return retrieveOrcidId(
|
||||||
|
bytes, defaultValue, NS_ACTIVITIES, NS_ACTIVITIES_URL, "//work:work-summary", "put-code");
|
||||||
|
}
|
||||||
|
|
||||||
private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
|
private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
|
||||||
String idAttributeName)
|
String idAttributeName)
|
||||||
throws VtdException, ParseException {
|
throws VtdException, ParseException {
|
||||||
|
@ -148,6 +195,7 @@ public class XMLRecordParser {
|
||||||
final VTDNav vn = vg.getNav();
|
final VTDNav vn = vg.getNav();
|
||||||
final AutoPilot ap = new AutoPilot(vn);
|
final AutoPilot ap = new AutoPilot(vn);
|
||||||
ap.declareXPathNameSpace(ns, nsUrl);
|
ap.declareXPathNameSpace(ns, nsUrl);
|
||||||
|
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
|
||||||
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
|
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
|
||||||
.getTextValuesWithAttributes(
|
.getTextValuesWithAttributes(
|
||||||
ap, vn, xpath, Arrays.asList(idAttributeName));
|
ap, vn, xpath, Arrays.asList(idAttributeName));
|
||||||
|
@ -157,4 +205,144 @@ public class XMLRecordParser {
|
||||||
Log.info("id not found - default: " + defaultValue);
|
Log.info("id not found - default: " + defaultValue);
|
||||||
return defaultValue;
|
return defaultValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Map<String, String> retrieveWorkIdLastModifiedDate(byte[] bytes)
|
||||||
|
throws ParseException, XPathParseException, NavException, XPathEvalException, IOException {
|
||||||
|
final VTDGen vg = new VTDGen();
|
||||||
|
vg.setDoc(bytes);
|
||||||
|
vg.parse(true);
|
||||||
|
final VTDNav vn = vg.getNav();
|
||||||
|
final AutoPilot ap = new AutoPilot(vn);
|
||||||
|
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
|
||||||
|
Map<String, String> workIdLastModifiedDate = new HashMap<>();
|
||||||
|
ap.selectXPath("//work:work-summary");
|
||||||
|
String workId = "";
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
String lastModifiedDate = "";
|
||||||
|
int attr = vn.getAttrVal("put-code");
|
||||||
|
if (attr > -1) {
|
||||||
|
workId = vn.toNormalizedString(attr);
|
||||||
|
}
|
||||||
|
if (vn.toElement(VTDNav.FIRST_CHILD, "common:last-modified-date")) {
|
||||||
|
int val = vn.getText();
|
||||||
|
if (val != -1) {
|
||||||
|
lastModifiedDate = vn.toNormalizedString(val);
|
||||||
|
workIdLastModifiedDate.put(workId, lastModifiedDate);
|
||||||
|
}
|
||||||
|
vn.toElement(VTDNav.PARENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return workIdLastModifiedDate;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static AuthorSummary VTDParseAuthorSummary(byte[] bytes)
|
||||||
|
throws VtdException, ParseException {
|
||||||
|
final VTDGen vg = new VTDGen();
|
||||||
|
vg.setDoc(bytes);
|
||||||
|
vg.parse(true);
|
||||||
|
final VTDNav vn = vg.getNav();
|
||||||
|
final AutoPilot ap = new AutoPilot(vn);
|
||||||
|
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_HISTORY, NS_HISTORY_URL);
|
||||||
|
|
||||||
|
AuthorData authorData = retrieveAuthorData(ap, vn, bytes);
|
||||||
|
AuthorHistory authorHistory = retrieveAuthorHistory(ap, vn, bytes);
|
||||||
|
AuthorSummary authorSummary = new AuthorSummary();
|
||||||
|
authorSummary.setAuthorData(authorData);
|
||||||
|
authorSummary.setAuthorHistory(authorHistory);
|
||||||
|
return authorSummary;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AuthorData retrieveAuthorData(AutoPilot ap, VTDNav vn, byte[] bytes)
|
||||||
|
throws VtdException {
|
||||||
|
AuthorData authorData = new AuthorData();
|
||||||
|
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||||
|
if (!errors.isEmpty()) {
|
||||||
|
authorData.setErrorCode(errors.get(0));
|
||||||
|
return authorData;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
|
||||||
|
.getTextValuesWithAttributes(
|
||||||
|
ap, vn, "//record:record", Arrays.asList("path"));
|
||||||
|
if (!recordNodes.isEmpty()) {
|
||||||
|
final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1);
|
||||||
|
authorData.setOid(oid);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> names = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names");
|
||||||
|
if (!names.isEmpty()) {
|
||||||
|
authorData.setName(names.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> surnames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name");
|
||||||
|
if (!surnames.isEmpty()) {
|
||||||
|
authorData.setSurname(surnames.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> creditNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name");
|
||||||
|
if (!creditNames.isEmpty()) {
|
||||||
|
authorData.setCreditName(creditNames.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content");
|
||||||
|
if (!otherNames.isEmpty()) {
|
||||||
|
authorData.setOtherNames(otherNames);
|
||||||
|
}
|
||||||
|
return authorData;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AuthorHistory retrieveAuthorHistory(AutoPilot ap, VTDNav vn, byte[] bytes)
|
||||||
|
throws VtdException {
|
||||||
|
AuthorHistory authorHistory = new AuthorHistory();
|
||||||
|
final String creationMethod = VtdUtilityParser.getSingleValue(ap, vn, "//history:creation-method");
|
||||||
|
if (StringUtils.isNoneBlank(creationMethod)) {
|
||||||
|
authorHistory.setCreationMethod(creationMethod);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String completionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:completion-date");
|
||||||
|
if (StringUtils.isNoneBlank(completionDate)) {
|
||||||
|
authorHistory.setCompletionDate(completionDate);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String submissionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:submission-date");
|
||||||
|
if (StringUtils.isNoneBlank(submissionDate)) {
|
||||||
|
authorHistory.setSubmissionDate(submissionDate);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String claimed = VtdUtilityParser.getSingleValue(ap, vn, "//history:claimed");
|
||||||
|
if (StringUtils.isNoneBlank(claimed)) {
|
||||||
|
authorHistory.setClaimed(Boolean.parseBoolean(claimed));
|
||||||
|
}
|
||||||
|
|
||||||
|
final String verifiedEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-email");
|
||||||
|
if (StringUtils.isNoneBlank(verifiedEmail)) {
|
||||||
|
authorHistory.setVerifiedEmail(Boolean.parseBoolean(verifiedEmail));
|
||||||
|
}
|
||||||
|
|
||||||
|
final String verifiedPrimaryEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-primary-email");
|
||||||
|
if (StringUtils.isNoneBlank(verifiedPrimaryEmail)) {
|
||||||
|
authorHistory.setVerifiedPrimaryEmail(Boolean.parseBoolean(verifiedPrimaryEmail));
|
||||||
|
}
|
||||||
|
|
||||||
|
final String deactivationDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:deactivation-date");
|
||||||
|
if (StringUtils.isNoneBlank(deactivationDate)) {
|
||||||
|
authorHistory.setDeactivationDate(deactivationDate);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String lastModifiedDate = VtdUtilityParser
|
||||||
|
.getSingleValue(ap, vn, "//history:history/common:last-modified-date");
|
||||||
|
if (StringUtils.isNoneBlank(lastModifiedDate)) {
|
||||||
|
authorHistory.setLastModifiedDate(lastModifiedDate);
|
||||||
|
}
|
||||||
|
return authorHistory;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,8 +19,8 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -87,29 +87,29 @@ public class ActivitiesDumpReader {
|
||||||
while ((line = br.readLine()) != null) {
|
while ((line = br.readLine()) != null) {
|
||||||
buffer.append(line);
|
buffer.append(line);
|
||||||
}
|
}
|
||||||
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi
|
WorkDetail workDetail = XMLRecordParserNoDoi
|
||||||
.VTDParseWorkData(buffer.toString().getBytes());
|
.VTDParseWorkData(buffer.toString().getBytes());
|
||||||
if (workDataNoDoi != null) {
|
if (workDetail != null) {
|
||||||
if (workDataNoDoi.getErrorCode() != null) {
|
if (workDetail.getErrorCode() != null) {
|
||||||
errorFromOrcidFound += 1;
|
errorFromOrcidFound += 1;
|
||||||
Log
|
Log
|
||||||
.debug(
|
.debug(
|
||||||
"error from Orcid with code "
|
"error from Orcid with code "
|
||||||
+ workDataNoDoi.getErrorCode()
|
+ workDetail.getErrorCode()
|
||||||
+ " for entry "
|
+ " for entry "
|
||||||
+ entry.getName());
|
+ entry.getName());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
boolean isDoiFound = workDataNoDoi
|
boolean isDoiFound = workDetail
|
||||||
.getExtIds()
|
.getExtIds()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(e -> e.getType() != null)
|
.filter(e -> e.getType() != null)
|
||||||
.anyMatch(e -> e.getType().equals("doi"));
|
.anyMatch(e -> e.getType().equals("doi"));
|
||||||
if (!isDoiFound) {
|
if (!isDoiFound) {
|
||||||
String jsonData = JsonHelper.createOidWork(workDataNoDoi);
|
String jsonData = JsonHelper.createOidWork(workDetail);
|
||||||
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
|
Log.debug("oid: " + workDetail.getOid() + " data: " + jsonData);
|
||||||
|
|
||||||
final Text key = new Text(workDataNoDoi.getOid());
|
final Text key = new Text(workDetail.getOid());
|
||||||
final Text value = new Text(jsonData);
|
final Text value = new Text(jsonData);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -4,10 +4,12 @@ package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -18,6 +20,7 @@ import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -30,14 +33,17 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.Work;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This spark job generates one parquet file, containing orcid publications dataset
|
* This spark job generates orcid publications no doi dataset
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class SparkGenEnrichedOrcidWorks {
|
public class SparkGenEnrichedOrcidWorks {
|
||||||
|
@ -53,47 +59,65 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
.toString(
|
.toString(
|
||||||
SparkGenEnrichedOrcidWorks.class
|
SparkGenEnrichedOrcidWorks.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
"/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
Boolean isSparkSessionManaged = Optional
|
Boolean isSparkSessionManaged = Optional
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
.map(Boolean::valueOf)
|
.map(Boolean::valueOf)
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
|
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
|
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
|
||||||
final String outputWorksPath = parser.get("outputWorksPath");
|
final String orcidDataFolder = parser.get("orcidDataFolder");
|
||||||
final String hdfsServerUri = parser.get("hdfsServerUri");
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||||
|
if (StringUtils.isBlank(lastUpdate)) {
|
||||||
|
throw new RuntimeException("last update info not found");
|
||||||
|
}
|
||||||
|
final String dateOfCollection = lastUpdate.substring(0, 10);
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
Dataset<AuthorData> authorDataset = spark
|
||||||
.sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
|
|
||||||
Dataset<AuthorData> summariesDataset = spark
|
|
||||||
.createDataset(
|
.createDataset(
|
||||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
sc
|
||||||
|
.textFile(workingPath.concat(orcidDataFolder).concat("/authors/*"))
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
|
||||||
|
.filter(authorSummary -> authorSummary.getAuthorData() != null)
|
||||||
|
.map(authorSummary -> authorSummary.getAuthorData())
|
||||||
|
.rdd(),
|
||||||
Encoders.bean(AuthorData.class));
|
Encoders.bean(AuthorData.class));
|
||||||
logger.info("Authors data loaded: " + summariesDataset.count());
|
logger.info("Authors data loaded: " + authorDataset.count());
|
||||||
|
|
||||||
JavaPairRDD<Text, Text> activitiesRDD = sc
|
Dataset<WorkDetail> workDataset = spark
|
||||||
.sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class);
|
|
||||||
Dataset<WorkDataNoDoi> activitiesDataset = spark
|
|
||||||
.createDataset(
|
.createDataset(
|
||||||
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
|
sc
|
||||||
Encoders.bean(WorkDataNoDoi.class));
|
.textFile(workingPath.concat(orcidDataFolder).concat("/works/*"))
|
||||||
logger.info("Works data loaded: " + activitiesDataset.count());
|
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
|
||||||
|
.filter(work -> work.getWorkDetail() != null)
|
||||||
|
.map(work -> work.getWorkDetail())
|
||||||
|
.filter(work -> work.getErrorCode() == null)
|
||||||
|
.filter(
|
||||||
|
work -> work
|
||||||
|
.getExtIds()
|
||||||
|
.stream()
|
||||||
|
.filter(e -> e.getType() != null)
|
||||||
|
.noneMatch(e -> e.getType().equalsIgnoreCase("doi")))
|
||||||
|
.rdd(),
|
||||||
|
Encoders.bean(WorkDetail.class));
|
||||||
|
logger.info("Works data loaded: " + workDataset.count());
|
||||||
|
|
||||||
JavaRDD<Tuple2<String, String>> enrichedWorksRDD = activitiesDataset
|
JavaRDD<Tuple2<String, String>> enrichedWorksRDD = workDataset
|
||||||
.joinWith(
|
.joinWith(
|
||||||
summariesDataset,
|
authorDataset,
|
||||||
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
|
workDataset.col("oid").equalTo(authorDataset.col("oid")), "inner")
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, String>>) value -> {
|
(MapFunction<Tuple2<WorkDetail, AuthorData>, Tuple2<String, String>>) value -> {
|
||||||
WorkDataNoDoi w = value._1;
|
WorkDetail w = value._1;
|
||||||
AuthorData a = value._2;
|
AuthorData a = value._2;
|
||||||
AuthorMatcher.match(a, w.getContributors());
|
AuthorMatcher.match(a, w.getContributors());
|
||||||
return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
|
return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
|
||||||
|
@ -113,13 +137,25 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
.sparkContext()
|
.sparkContext()
|
||||||
.longAccumulator("errorsNotFoundAuthors");
|
.longAccumulator("errorsNotFoundAuthors");
|
||||||
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
|
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
|
||||||
|
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
|
||||||
|
final LongAccumulator deactivatedAcc = spark.sparkContext().longAccumulator("deactivated_found");
|
||||||
|
final LongAccumulator titleNotProvidedAcc = spark
|
||||||
|
.sparkContext()
|
||||||
|
.longAccumulator("Title_not_provided_found");
|
||||||
|
final LongAccumulator noUrlAcc = spark.sparkContext().longAccumulator("no_url_found");
|
||||||
|
|
||||||
final PublicationToOaf publicationToOaf = new PublicationToOaf(
|
final PublicationToOaf publicationToOaf = new PublicationToOaf(
|
||||||
parsedPublications,
|
parsedPublications,
|
||||||
enrichedPublications,
|
enrichedPublications,
|
||||||
errorsGeneric,
|
errorsGeneric,
|
||||||
errorsInvalidTitle,
|
errorsInvalidTitle,
|
||||||
errorsNotFoundAuthors,
|
errorsNotFoundAuthors,
|
||||||
errorsInvalidType);
|
errorsInvalidType,
|
||||||
|
otherTypeFound,
|
||||||
|
deactivatedAcc,
|
||||||
|
titleNotProvidedAcc,
|
||||||
|
noUrlAcc,
|
||||||
|
dateOfCollection);
|
||||||
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
|
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
|
||||||
.map(
|
.map(
|
||||||
e -> {
|
e -> {
|
||||||
|
@ -148,33 +184,10 @@ public class SparkGenEnrichedOrcidWorks {
|
||||||
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
|
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
|
||||||
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
|
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
|
||||||
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
|
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
|
||||||
|
logger.info("otherTypeFound: " + otherTypeFound.value().toString());
|
||||||
|
logger.info("deactivatedAcc: " + deactivatedAcc.value().toString());
|
||||||
|
logger.info("titleNotProvidedAcc: " + titleNotProvidedAcc.value().toString());
|
||||||
|
logger.info("noUrlAcc: " + noUrlAcc.value().toString());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
|
|
||||||
AuthorData authorData = new AuthorData();
|
|
||||||
authorData.setOid(orcidId.toString());
|
|
||||||
JsonElement jElement = new JsonParser().parse(json.toString());
|
|
||||||
authorData.setName(getJsonValue(jElement, "name"));
|
|
||||||
authorData.setSurname(getJsonValue(jElement, "surname"));
|
|
||||||
authorData.setCreditName(getJsonValue(jElement, "creditname"));
|
|
||||||
return authorData;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
|
|
||||||
|
|
||||||
WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
|
|
||||||
return workData;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String getJsonValue(JsonElement jElement, String property) {
|
|
||||||
if (jElement.getAsJsonObject().has(property)) {
|
|
||||||
JsonElement name = null;
|
|
||||||
name = jElement.getAsJsonObject().get(property);
|
|
||||||
if (name != null && !name.isJsonNull()) {
|
|
||||||
return name.getAsString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new String("");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,10 @@ public class JsonWriter {
|
||||||
return OBJECT_MAPPER.writeValueAsString(authorData);
|
return OBJECT_MAPPER.writeValueAsString(authorData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String create(Object obj) throws JsonProcessingException {
|
||||||
|
return OBJECT_MAPPER.writeValueAsString(obj);
|
||||||
|
}
|
||||||
|
|
||||||
public static String create(WorkData workData) {
|
public static String create(WorkData workData) {
|
||||||
JsonObject work = new JsonObject();
|
JsonObject work = new JsonObject();
|
||||||
work.addProperty("oid", workData.getOid());
|
work.addProperty("oid", workData.getOid());
|
||||||
|
|
|
@ -18,7 +18,6 @@ import com.google.gson.*;
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
||||||
|
@ -26,21 +25,28 @@ import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
||||||
/**
|
/**
|
||||||
* This class converts an orcid publication from json format to oaf
|
* This class converts an orcid publication from json format to oaf
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class PublicationToOaf implements Serializable {
|
public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
||||||
|
|
||||||
public static final String ORCID = StringUtils.upperCase(ModelConstants.ORCID);
|
|
||||||
public final static String orcidPREFIX = "orcid_______";
|
public final static String orcidPREFIX = "orcid_______";
|
||||||
public static final String OPENAIRE_PREFIX = "openaire____";
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
||||||
public static final String SEPARATOR = "::";
|
public static final String SEPARATOR = "::";
|
||||||
|
public static final String DEACTIVATED_NAME = "Given Names Deactivated";
|
||||||
|
public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
|
||||||
|
|
||||||
|
private String dateOfCollection = "";
|
||||||
private final LongAccumulator parsedPublications;
|
private final LongAccumulator parsedPublications;
|
||||||
private final LongAccumulator enrichedPublications;
|
private final LongAccumulator enrichedPublications;
|
||||||
private final LongAccumulator errorsGeneric;
|
private final LongAccumulator errorsGeneric;
|
||||||
private final LongAccumulator errorsInvalidTitle;
|
private final LongAccumulator errorsInvalidTitle;
|
||||||
private final LongAccumulator errorsNotFoundAuthors;
|
private final LongAccumulator errorsNotFoundAuthors;
|
||||||
private final LongAccumulator errorsInvalidType;
|
private final LongAccumulator errorsInvalidType;
|
||||||
|
private final LongAccumulator otherTypeFound;
|
||||||
|
private final LongAccumulator deactivatedAcc;
|
||||||
|
private final LongAccumulator titleNotProvidedAcc;
|
||||||
|
private final LongAccumulator noUrlAcc;
|
||||||
|
|
||||||
public PublicationToOaf(
|
public PublicationToOaf(
|
||||||
LongAccumulator parsedPublications,
|
LongAccumulator parsedPublications,
|
||||||
|
@ -48,13 +54,23 @@ public class PublicationToOaf implements Serializable {
|
||||||
LongAccumulator errorsGeneric,
|
LongAccumulator errorsGeneric,
|
||||||
LongAccumulator errorsInvalidTitle,
|
LongAccumulator errorsInvalidTitle,
|
||||||
LongAccumulator errorsNotFoundAuthors,
|
LongAccumulator errorsNotFoundAuthors,
|
||||||
LongAccumulator errorsInvalidType) {
|
LongAccumulator errorsInvalidType,
|
||||||
|
LongAccumulator otherTypeFound,
|
||||||
|
LongAccumulator deactivatedAcc,
|
||||||
|
LongAccumulator titleNotProvidedAcc,
|
||||||
|
LongAccumulator noUrlAcc,
|
||||||
|
String dateOfCollection) {
|
||||||
this.parsedPublications = parsedPublications;
|
this.parsedPublications = parsedPublications;
|
||||||
this.enrichedPublications = enrichedPublications;
|
this.enrichedPublications = enrichedPublications;
|
||||||
this.errorsGeneric = errorsGeneric;
|
this.errorsGeneric = errorsGeneric;
|
||||||
this.errorsInvalidTitle = errorsInvalidTitle;
|
this.errorsInvalidTitle = errorsInvalidTitle;
|
||||||
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
|
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
|
||||||
this.errorsInvalidType = errorsInvalidType;
|
this.errorsInvalidType = errorsInvalidType;
|
||||||
|
this.otherTypeFound = otherTypeFound;
|
||||||
|
this.deactivatedAcc = deactivatedAcc;
|
||||||
|
this.titleNotProvidedAcc = titleNotProvidedAcc;
|
||||||
|
this.noUrlAcc = noUrlAcc;
|
||||||
|
this.dateOfCollection = dateOfCollection;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PublicationToOaf() {
|
public PublicationToOaf() {
|
||||||
|
@ -64,12 +80,19 @@ public class PublicationToOaf implements Serializable {
|
||||||
this.errorsInvalidTitle = null;
|
this.errorsInvalidTitle = null;
|
||||||
this.errorsNotFoundAuthors = null;
|
this.errorsNotFoundAuthors = null;
|
||||||
this.errorsInvalidType = null;
|
this.errorsInvalidType = null;
|
||||||
|
this.otherTypeFound = null;
|
||||||
|
this.deactivatedAcc = null;
|
||||||
|
this.titleNotProvidedAcc = null;
|
||||||
|
this.noUrlAcc = null;
|
||||||
|
this.dateOfCollection = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
||||||
|
|
||||||
{
|
{
|
||||||
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID));
|
put(
|
||||||
|
ModelConstants.ORCID,
|
||||||
|
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -79,9 +102,9 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
{
|
{
|
||||||
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
||||||
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
put("arxiv".toLowerCase(), new Pair<>("arXiv", "arXiv"));
|
||||||
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
put("pmc".toLowerCase(), new Pair<>("pmc", "PubMed Central ID"));
|
||||||
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
put("pmid".toLowerCase(), new Pair<>("pmid", "PubMed ID"));
|
||||||
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcid workid"));
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcid workid"));
|
||||||
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
||||||
}
|
}
|
||||||
|
@ -102,21 +125,15 @@ public class PublicationToOaf implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final String PID_TYPES = "dnet:pid_types";
|
||||||
|
|
||||||
public Oaf generatePublicationActionsFromJson(final String json) {
|
public Oaf generatePublicationActionsFromJson(final String json) {
|
||||||
try {
|
|
||||||
if (parsedPublications != null) {
|
if (parsedPublications != null) {
|
||||||
parsedPublications.add(1);
|
parsedPublications.add(1);
|
||||||
}
|
}
|
||||||
JsonElement jElement = new JsonParser().parse(json);
|
JsonElement jElement = new JsonParser().parse(json);
|
||||||
JsonObject jObject = jElement.getAsJsonObject();
|
JsonObject jObject = jElement.getAsJsonObject();
|
||||||
return generatePublicationActionsFromDump(jObject);
|
return generatePublicationActionsFromDump(jObject);
|
||||||
} catch (Throwable t) {
|
|
||||||
logger.error("creating publication: " + t.getMessage());
|
|
||||||
if (errorsGeneric != null) {
|
|
||||||
errorsGeneric.add(1);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
|
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
|
||||||
|
@ -142,7 +159,7 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
publication.setLastupdatetimestamp(new Date().getTime());
|
publication.setLastupdatetimestamp(new Date().getTime());
|
||||||
|
|
||||||
publication.setDateofcollection("2020-10-14");
|
publication.setDateofcollection(dateOfCollection);
|
||||||
publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
|
publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
|
||||||
|
|
||||||
// Adding external ids
|
// Adding external ids
|
||||||
|
@ -150,8 +167,8 @@ public class PublicationToOaf implements Serializable {
|
||||||
.keySet()
|
.keySet()
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(jsonExtId -> {
|
.forEach(jsonExtId -> {
|
||||||
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
||||||
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
||||||
final String extId = getStringValue(rootElement, jsonExtId);
|
final String extId = getStringValue(rootElement, jsonExtId);
|
||||||
if (StringUtils.isNotBlank(extId)) {
|
if (StringUtils.isNotBlank(extId)) {
|
||||||
publication
|
publication
|
||||||
|
@ -182,11 +199,19 @@ public class PublicationToOaf implements Serializable {
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
|
||||||
|
if (titleNotProvidedAcc != null) {
|
||||||
|
titleNotProvidedAcc.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
publication
|
publication
|
||||||
.setTitle(
|
.setTitle(
|
||||||
titles
|
titles
|
||||||
.stream()
|
.stream()
|
||||||
.map(t -> mapStructuredProperty(t, ModelConstants.MAIN_TITLE_QUALIFIER, null))
|
.map(t -> {
|
||||||
|
return mapStructuredProperty(t, ModelConstants.MAIN_TITLE_QUALIFIER, null);
|
||||||
|
})
|
||||||
.filter(s -> s != null)
|
.filter(s -> s != null)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
// Adding identifier
|
// Adding identifier
|
||||||
|
@ -216,8 +241,23 @@ public class PublicationToOaf implements Serializable {
|
||||||
mapQualifier(
|
mapQualifier(
|
||||||
type, type, ModelConstants.DNET_DATA_CITE_RESOURCE, ModelConstants.DNET_DATA_CITE_RESOURCE));
|
type, type, ModelConstants.DNET_DATA_CITE_RESOURCE, ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||||
|
|
||||||
|
Map<String, String> publicationType = typologiesMapping.get(type);
|
||||||
|
if ((publicationType == null || publicationType.isEmpty()) && errorsInvalidType != null) {
|
||||||
|
errorsInvalidType.add(1);
|
||||||
|
logger.error("publication_type_not_found: " + type);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final String typeValue = typologiesMapping.get(type).get("value");
|
final String typeValue = typologiesMapping.get(type).get("value");
|
||||||
cobjValue = typologiesMapping.get(type).get("cobj");
|
cobjValue = typologiesMapping.get(type).get("cobj");
|
||||||
|
// this dataset must contain only publication
|
||||||
|
if (cobjValue.equals("0020")) {
|
||||||
|
if (otherTypeFound != null) {
|
||||||
|
otherTypeFound.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
final Instance instance = new Instance();
|
final Instance instance = new Instance();
|
||||||
|
|
||||||
// Adding hostedby
|
// Adding hostedby
|
||||||
|
@ -228,8 +268,13 @@ public class PublicationToOaf implements Serializable {
|
||||||
if (urls != null && !urls.isEmpty()) {
|
if (urls != null && !urls.isEmpty()) {
|
||||||
instance.setUrl(urls);
|
instance.setUrl(urls);
|
||||||
} else {
|
} else {
|
||||||
dataInfo.setInvisible(true);
|
if (noUrlAcc != null) {
|
||||||
|
noUrlAcc.add(1);
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
dataInfo.setInvisible(true);
|
||||||
|
|
||||||
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
||||||
if (StringUtils.isNotBlank(pubDate)) {
|
if (StringUtils.isNotBlank(pubDate)) {
|
||||||
|
@ -241,11 +286,9 @@ public class PublicationToOaf implements Serializable {
|
||||||
// Adding accessright
|
// Adding accessright
|
||||||
instance
|
instance
|
||||||
.setAccessright(
|
.setAccessright(
|
||||||
OafUtils
|
OafMapperUtils
|
||||||
.createAccessRight(
|
.accessRight(
|
||||||
ModelConstants.UNKNOWN,
|
ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_ACCESS_MODES,
|
||||||
ModelConstants.UNKNOWN,
|
|
||||||
ModelConstants.DNET_ACCESS_MODES,
|
|
||||||
ModelConstants.DNET_ACCESS_MODES));
|
ModelConstants.DNET_ACCESS_MODES));
|
||||||
|
|
||||||
// Adding type
|
// Adding type
|
||||||
|
@ -266,13 +309,29 @@ public class PublicationToOaf implements Serializable {
|
||||||
// Adding authors
|
// Adding authors
|
||||||
final List<Author> authors = createAuthors(rootElement);
|
final List<Author> authors = createAuthors(rootElement);
|
||||||
if (authors != null && authors.size() > 0) {
|
if (authors != null && authors.size() > 0) {
|
||||||
|
if (authors.stream().filter(a -> {
|
||||||
|
return ((Objects.nonNull(a.getName()) && a.getName().equals(DEACTIVATED_NAME)) ||
|
||||||
|
(Objects.nonNull(a.getSurname()) && a.getSurname().equals(DEACTIVATED_SURNAME)));
|
||||||
|
}).count() > 0) {
|
||||||
|
if (deactivatedAcc != null) {
|
||||||
|
deactivatedAcc.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
publication.setAuthor(authors);
|
publication.setAuthor(authors);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (authors == null) {
|
||||||
|
Gson gson = new GsonBuilder().setPrettyPrinting().create();
|
||||||
|
String json = gson.toJson(rootElement);
|
||||||
|
throw new RuntimeException("not_valid_authors: " + json);
|
||||||
} else {
|
} else {
|
||||||
if (errorsNotFoundAuthors != null) {
|
if (errorsNotFoundAuthors != null) {
|
||||||
errorsNotFoundAuthors.add(1);
|
errorsNotFoundAuthors.add(1);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
String classValue = getDefaultResulttype(cobjValue);
|
String classValue = getDefaultResulttype(cobjValue);
|
||||||
publication
|
publication
|
||||||
.setResulttype(
|
.setResulttype(
|
||||||
|
@ -518,36 +577,33 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
private KeyValue createCollectedFrom() {
|
private KeyValue createCollectedFrom() {
|
||||||
KeyValue cf = new KeyValue();
|
KeyValue cf = new KeyValue();
|
||||||
cf.setValue(ORCID);
|
cf.setValue(ModelConstants.ORCID.toUpperCase());
|
||||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
||||||
return cf;
|
return cf;
|
||||||
}
|
}
|
||||||
|
|
||||||
private KeyValue createHostedBy() {
|
private KeyValue createHostedBy() {
|
||||||
KeyValue hb = new KeyValue();
|
return ModelConstants.UNKNOWN_REPOSITORY;
|
||||||
hb.setValue("Unknown Repository");
|
|
||||||
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
|
|
||||||
return hb;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private StructuredProperty mapAuthorId(String orcidId) {
|
private StructuredProperty mapAuthorId(String orcidId) {
|
||||||
final StructuredProperty sp = new StructuredProperty();
|
final StructuredProperty sp = new StructuredProperty();
|
||||||
sp.setValue(orcidId);
|
sp.setValue(orcidId);
|
||||||
final Qualifier q = new Qualifier();
|
final Qualifier q = new Qualifier();
|
||||||
q.setClassid(ORCID.toLowerCase());
|
q.setClassid(ModelConstants.ORCID);
|
||||||
q.setClassname(ORCID.toLowerCase());
|
q.setClassname(ModelConstants.ORCID_CLASSNAME);
|
||||||
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||||
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||||
sp.setQualifier(q);
|
sp.setQualifier(q);
|
||||||
final DataInfo dataInfo = new DataInfo();
|
final DataInfo dataInfo = new DataInfo();
|
||||||
dataInfo.setDeletedbyinference(false);
|
dataInfo.setDeletedbyinference(false);
|
||||||
dataInfo.setInferred(false);
|
dataInfo.setInferred(false);
|
||||||
dataInfo.setTrust("0.9");
|
dataInfo.setTrust("0.91");
|
||||||
dataInfo
|
dataInfo
|
||||||
.setProvenanceaction(
|
.setProvenanceaction(
|
||||||
mapQualifier(
|
mapQualifier(
|
||||||
"sysimport:crosswalk:entityregistry",
|
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||||
"Harvested",
|
ModelConstants.HARVESTED,
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||||
sp.setDataInfo(dataInfo);
|
sp.setDataInfo(dataInfo);
|
||||||
|
|
|
@ -0,0 +1,77 @@
|
||||||
|
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java (rejected hunks)
|
||||||
|
@@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
|
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
||||||
|
|
||||||
|
- public static final String ORCID = "ORCID";
|
||||||
|
- public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
|
||||||
|
public final static String orcidPREFIX = "orcid_______";
|
||||||
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
||||||
|
public static final String SEPARATOR = "::";
|
||||||
|
+ public static final String DEACTIVATED_NAME = "Given Names Deactivated";
|
||||||
|
+ public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
|
||||||
|
|
||||||
|
private String dateOfCollection = "";
|
||||||
|
private final LongAccumulator parsedPublications;
|
||||||
|
@@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
this.errorsNotFoundAuthors = null;
|
||||||
|
this.errorsInvalidType = null;
|
||||||
|
this.otherTypeFound = null;
|
||||||
|
+ this.deactivatedAcc = null;
|
||||||
|
+ this.titleNotProvidedAcc = null;
|
||||||
|
+ this.noUrlAcc = null;
|
||||||
|
this.dateOfCollection = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
||||||
|
|
||||||
|
{
|
||||||
|
- put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||||
|
+ put(
|
||||||
|
+ ModelConstants.ORCID,
|
||||||
|
+ new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||||
|
|
||||||
|
}
|
||||||
|
};
|
||||||
|
@@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
+ if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
|
||||||
|
+ if (titleNotProvidedAcc != null) {
|
||||||
|
+ titleNotProvidedAcc.add(1);
|
||||||
|
+ }
|
||||||
|
+ return null;
|
||||||
|
+ }
|
||||||
|
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||||
|
publication
|
||||||
|
.setTitle(
|
||||||
|
@@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
|
private KeyValue createCollectedFrom() {
|
||||||
|
KeyValue cf = new KeyValue();
|
||||||
|
- cf.setValue(ORCID);
|
||||||
|
+ cf.setValue(ModelConstants.ORCID.toUpperCase());
|
||||||
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
||||||
|
return cf;
|
||||||
|
}
|
||||||
|
|
||||||
|
private KeyValue createHostedBy() {
|
||||||
|
- KeyValue hb = new KeyValue();
|
||||||
|
- hb.setValue("Unknown Repository");
|
||||||
|
- hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
|
||||||
|
- return hb;
|
||||||
|
+ return ModelConstants.UNKNOWN_REPOSITORY;
|
||||||
|
}
|
||||||
|
|
||||||
|
private StructuredProperty mapAuthorId(String orcidId) {
|
||||||
|
final StructuredProperty sp = new StructuredProperty();
|
||||||
|
sp.setValue(orcidId);
|
||||||
|
final Qualifier q = new Qualifier();
|
||||||
|
- q.setClassid(ORCID.toLowerCase());
|
||||||
|
- q.setClassname(ORCID_PID_TYPE_CLASSNAME);
|
||||||
|
+ q.setClassid(ModelConstants.ORCID);
|
||||||
|
+ q.setClassname(ModelConstants.ORCID_CLASSNAME);
|
||||||
|
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||||
|
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||||
|
sp.setQualifier(q);
|
|
@ -19,8 +19,8 @@ import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
import eu.dnetlib.dhp.schema.orcid.Contributor;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class is used for searching from a list of publication contributors a
|
* This class is used for searching from a list of publication contributors a
|
||||||
|
@ -209,7 +209,7 @@ public class AuthorMatcher {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String toJson(WorkDataNoDoi work) {
|
private static String toJson(WorkDetail work) {
|
||||||
GsonBuilder builder = new GsonBuilder();
|
GsonBuilder builder = new GsonBuilder();
|
||||||
Gson gson = builder.create();
|
Gson gson = builder.create();
|
||||||
return gson.toJson(work);
|
return gson.toJson(work);
|
||||||
|
|
|
@ -12,10 +12,10 @@ import com.ximpleware.*;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
import eu.dnetlib.dhp.schema.orcid.Contributor;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
|
import eu.dnetlib.dhp.schema.orcid.ExternalId;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
|
import eu.dnetlib.dhp.schema.orcid.PublicationDate;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class is used for parsing xml data with vtd parser
|
* This class is used for parsing xml data with vtd parser
|
||||||
|
@ -42,7 +42,7 @@ public class XMLRecordParserNoDoi {
|
||||||
|
|
||||||
private static final String NS_ERROR = "error";
|
private static final String NS_ERROR = "error";
|
||||||
|
|
||||||
public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
|
public static WorkDetail VTDParseWorkData(byte[] bytes)
|
||||||
throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
|
throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
|
||||||
NavException, XPathEvalException {
|
NavException, XPathEvalException {
|
||||||
final VTDGen vg = new VTDGen();
|
final VTDGen vg = new VTDGen();
|
||||||
|
@ -54,7 +54,7 @@ public class XMLRecordParserNoDoi {
|
||||||
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
|
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
|
||||||
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||||
|
|
||||||
WorkDataNoDoi workData = new WorkDataNoDoi();
|
WorkDetail workData = new WorkDetail();
|
||||||
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||||
if (!errors.isEmpty()) {
|
if (!errors.isEmpty()) {
|
||||||
workData.setErrorCode(errors.get(0));
|
workData.setErrorCode(errors.get(0));
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true},
|
[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true},
|
||||||
|
{"paramName":"a", "paramLongName":"authorsPath", "paramDescription": "the path of the authors seq file", "paramRequired": true},
|
||||||
|
{"paramName":"xw", "paramLongName":"xmlWorksPath", "paramDescription": "the path of the works xml seq file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputDoiAuthorListPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
{"paramName":"o", "paramLongName":"outputDoiAuthorListPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -1,7 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
{"paramName":"i", "paramLongName":"orcidDataFolder", "paramDescription": "the folder of orcid data", "paramRequired": true},
|
||||||
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
|
|
||||||
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -1,18 +0,0 @@
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>jobTracker</name>
|
|
||||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>nameNode</name>
|
|
||||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>queueName</name>
|
|
||||||
<value>default</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
|
||||||
<value>spark2</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
|
@ -1,9 +1,5 @@
|
||||||
<workflow-app name="Gen_Doi_Author_List_WF" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="gen_doi_author_list" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<description>the working dir base path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>memory for driver process</description>
|
<description>memory for driver process</description>
|
||||||
|
@ -16,10 +12,50 @@
|
||||||
<name>sparkExecutorCores</name>
|
<name>sparkExecutorCores</name>
|
||||||
<description>number of cores used by single executor</description>
|
<description>number of cores used by single executor</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2MaxExecutors</name>
|
||||||
|
<value>20</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
@ -27,24 +63,32 @@
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
<action name="ResetWorkingPath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${workingPath_activities}/doi_author_list'/>
|
<delete path='${workingPath}/doi_author_list'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="Gen_Doi_Author_List"/>
|
<ok to="GenDoiAuthorList"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="Gen_Doi_Author_List">
|
<action name="GenDoiAuthorList">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<master>yarn-cluster</master>
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Gen_Doi_Author_List</name>
|
<name>GenDoiAuthorList</name>
|
||||||
<class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
|
<class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
|
||||||
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
<spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-a</arg><arg>authors/authors.seq</arg>
|
||||||
|
<arg>-xw</arg><arg>xml/works/*.seq</arg>
|
||||||
<arg>-o</arg><arg>doi_author_list/</arg>
|
<arg>-o</arg><arg>doi_author_list/</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -0,0 +1,163 @@
|
||||||
|
<workflow-app name="update_orcid_datasets" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>spark2MaxExecutors</name>
|
||||||
|
<value>50</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="promoteOrcidAuthorsDataset"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/orcid_dataset/new_authors'/>
|
||||||
|
<delete path='${workingPath}/orcid_dataset/new_works'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="UpdateOrcidAuthors"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="UpdateOrcidAuthors">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>UpdateOrcidAuthors</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>-</arg>
|
||||||
|
<arg>-o</arg><arg>-</arg>
|
||||||
|
<arg>-t</arg><arg>-</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="UpdateOrcidWorks"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="UpdateOrcidWorks">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>UpdateOrcidWorks</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>-</arg>
|
||||||
|
<arg>-o</arg><arg>-</arg>
|
||||||
|
<arg>-t</arg><arg>-</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="promoteOrcidAuthorsDataset">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<prepare>
|
||||||
|
<delete path="${workingPath}/orcid_dataset/authors"/>
|
||||||
|
<mkdir path="${workingPath}/orcid_dataset/authors"/>
|
||||||
|
</prepare>
|
||||||
|
<arg>${workingPath}/orcid_dataset/new_authors/*</arg>
|
||||||
|
<arg>${workingPath}/orcid_dataset/authors</arg>
|
||||||
|
</distcp>
|
||||||
|
<ok to="promoteOrcidWorksDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="promoteOrcidWorksDataset">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<prepare>
|
||||||
|
<delete path="${workingPath}/orcid_dataset/works"/>
|
||||||
|
<mkdir path="${workingPath}/orcid_dataset/works"/>
|
||||||
|
</prepare>
|
||||||
|
<arg>${workingPath}/orcid_dataset/new_works/*</arg>
|
||||||
|
<arg>${workingPath}/orcid_dataset/works</arg>
|
||||||
|
</distcp>
|
||||||
|
<ok to="CleanWorkingPath"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="CleanWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/orcid_dataset/new_authors'/>
|
||||||
|
<delete path='${workingPath}/orcid_dataset/new_works'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -1,22 +0,0 @@
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>jobTracker</name>
|
|
||||||
<value>yarnRM</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>nameNode</name>
|
|
||||||
<value>hdfs://nameservice1</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.java</name>
|
|
||||||
<value>spark2</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
|
||||||
<value>-Xmx4g</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
|
@ -1,9 +1,25 @@
|
||||||
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>spark2UpdateStepMaxExecutors</name>
|
||||||
|
<value>50</value>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
<description>the working dir base path</description>
|
<description>the working dir base path</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.java</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx4g</value>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>token</name>
|
<name>token</name>
|
||||||
<description>access token</description>
|
<description>access token</description>
|
||||||
|
@ -30,7 +46,7 @@
|
||||||
<description>number of cores used by single executor</description>
|
<description>number of cores used by single executor</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>spark2MaxExecutors</name>
|
<name>spark2DownloadingMaxExecutors</name>
|
||||||
<value>10</value>
|
<value>10</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
|
@ -58,6 +74,8 @@
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<global>
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
<configuration>
|
<configuration>
|
||||||
<property>
|
<property>
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
@ -66,18 +84,16 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="DownloadOrcidAuthors"/>
|
<start to="ResetLambda"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
<action name="ResetLambda">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${workingPath}/downloads'/>
|
|
||||||
<delete path='${workingPath}/last_modified.csv.tar'/>
|
<delete path='${workingPath}/last_modified.csv.tar'/>
|
||||||
<mkdir path='${workingPath}/downloads'/>
|
<delete path='${workingPath}/last_modified.seq'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="DownloadLambdaFile"/>
|
<ok to="DownloadLambdaFile"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -92,22 +108,7 @@
|
||||||
<argument>${shell_cmd}</argument>
|
<argument>${shell_cmd}</argument>
|
||||||
<capture-output/>
|
<capture-output/>
|
||||||
</shell>
|
</shell>
|
||||||
<ok to="DownloadUpdatedXMLAuthors"/>
|
<ok to="GenLastModifiedSeq"/>
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="DownloadUpdatedXMLAuthors">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
|
|
||||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>last_modified.csv.tar</arg>
|
|
||||||
<arg>-o</arg><arg>downloads/</arg>
|
|
||||||
<arg>-t</arg><arg>${token}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -133,7 +134,16 @@
|
||||||
<arg>-o</arg><arg>last_modified.seq</arg>
|
<arg>-o</arg><arg>last_modified.seq</arg>
|
||||||
<arg>-t</arg><arg>-</arg>
|
<arg>-t</arg><arg>-</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="ResetDownloads"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ResetDownloads">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/downloads/updated_authors'/>
|
||||||
|
<delete path='${workingPath}/downloads/updated_works'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="DownloadOrcidAuthors"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -146,7 +156,7 @@
|
||||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2DownloadingMaxExecutors}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
@ -160,6 +170,148 @@
|
||||||
<arg>-o</arg><arg>downloads/updated_authors</arg>
|
<arg>-o</arg><arg>downloads/updated_authors</arg>
|
||||||
<arg>-t</arg><arg>${token}</arg>
|
<arg>-t</arg><arg>${token}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
<ok to="DownloadOrcidWorks"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="DownloadOrcidWorks">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>DownloadOrcidWorks</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcid.SparkDownloadOrcidWorks</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2DownloadingMaxExecutors}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>-</arg>
|
||||||
|
<arg>-o</arg><arg>downloads/updated_works</arg>
|
||||||
|
<arg>-t</arg><arg>${token}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="UpdateOrcidAuthors"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="UpdateOrcidAuthors">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>UpdateOrcidAuthors</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>-</arg>
|
||||||
|
<arg>-o</arg><arg>-</arg>
|
||||||
|
<arg>-t</arg><arg>-</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="UpdateOrcidWorks"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="UpdateOrcidWorks">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>UpdateOrcidWorks</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2UpdateStepMaxExecutors}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>-</arg>
|
||||||
|
<arg>-o</arg><arg>-</arg>
|
||||||
|
<arg>-t</arg><arg>-</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="promoteOrcidAuthorsDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="promoteOrcidAuthorsDataset">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<prepare>
|
||||||
|
<delete path="${workingPath}/orcid_dataset/authors"/>
|
||||||
|
<mkdir path="${workingPath}/orcid_dataset/authors"/>
|
||||||
|
</prepare>
|
||||||
|
<arg>${workingPath}/orcid_dataset/new_authors/*</arg>
|
||||||
|
<arg>${workingPath}/orcid_dataset/authors</arg>
|
||||||
|
</distcp>
|
||||||
|
<ok to="promoteOrcidWorksDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="promoteOrcidWorksDataset">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<prepare>
|
||||||
|
<delete path="${workingPath}/orcid_dataset/works"/>
|
||||||
|
<mkdir path="${workingPath}/orcid_dataset/works"/>
|
||||||
|
</prepare>
|
||||||
|
<arg>${workingPath}/orcid_dataset/new_works/*</arg>
|
||||||
|
<arg>${workingPath}/orcid_dataset/works</arg>
|
||||||
|
</distcp>
|
||||||
|
<ok to="CleanWorkingPath"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="CleanWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/orcid_dataset/new_authors'/>
|
||||||
|
<delete path='${workingPath}/orcid_dataset/new_works'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="updateLastOrcidAuthorsDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="updateLastOrcidAuthorsDataset">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<prepare>
|
||||||
|
<delete path="${workingPath}/last_orcid_dataset/authors"/>
|
||||||
|
<mkdir path="${workingPath}/last_orcid_dataset/authors"/>
|
||||||
|
</prepare>
|
||||||
|
<arg>${workingPath}/orcid_dataset/authors/*</arg>
|
||||||
|
<arg>${workingPath}/last_orcid_dataset/authors</arg>
|
||||||
|
</distcp>
|
||||||
|
<ok to="updateLastOrcidWorksDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="updateLastOrcidWorksDataset">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<prepare>
|
||||||
|
<delete path="${workingPath}/last_orcid_dataset/works"/>
|
||||||
|
<mkdir path="${workingPath}/last_orcid_dataset/works"/>
|
||||||
|
</prepare>
|
||||||
|
<arg>${workingPath}/orcid_dataset/works/*</arg>
|
||||||
|
<arg>${workingPath}/last_orcid_dataset/works</arg>
|
||||||
|
</distcp>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
|
@ -1,19 +1,9 @@
|
||||||
{
|
{
|
||||||
"reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"},
|
|
||||||
"report": {"cobj":"0017", "value": "Report"},
|
"report": {"cobj":"0017", "value": "Report"},
|
||||||
"dataset": {"cobj":"0021", "value": "Dataset"},
|
|
||||||
"journal-article": {"cobj":"0001", "value": "Article"},
|
"journal-article": {"cobj":"0001", "value": "Article"},
|
||||||
"reference-book": {"cobj":"0002", "value": "Book"},
|
|
||||||
"other": {"cobj":"0020", "value": "Other ORP type"},
|
"other": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
"proceedings-article": {"cobj":"0004", "value": "Conference object"},
|
|
||||||
"standard": {"cobj":"0038", "value": "Other literature type"},
|
|
||||||
"book-part": {"cobj":"0002", "value": "Book"},
|
|
||||||
"monograph": {"cobj":"0002", "value": "Book"},
|
|
||||||
"report-series": {"cobj":"0017", "value": "Report"},
|
|
||||||
"book": {"cobj":"0002", "value": "Book"},
|
"book": {"cobj":"0002", "value": "Book"},
|
||||||
"book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"},
|
"book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"},
|
||||||
"peer-review": {"cobj":"0015", "value": "Review"},
|
|
||||||
"book-section": {"cobj":"0013", "value": "Part of book or chapter of book"},
|
|
||||||
"book-review": {"cobj":"0015", "value": "Review"},
|
"book-review": {"cobj":"0015", "value": "Review"},
|
||||||
"conference-abstract": {"cobj":"0004", "value": "Conference object"},
|
"conference-abstract": {"cobj":"0004", "value": "Conference object"},
|
||||||
"conference-paper": {"cobj":"0004", "value": "Conference object"},
|
"conference-paper": {"cobj":"0004", "value": "Conference object"},
|
||||||
|
@ -21,7 +11,7 @@
|
||||||
"data-set": {"cobj":"0021", "value": "Dataset"},
|
"data-set": {"cobj":"0021", "value": "Dataset"},
|
||||||
"dictionary-entry": {"cobj":"0038", "value": "Other literature type"},
|
"dictionary-entry": {"cobj":"0038", "value": "Other literature type"},
|
||||||
"disclosure": {"cobj":"0038", "value": "Other literature type"},
|
"disclosure": {"cobj":"0038", "value": "Other literature type"},
|
||||||
"dissertation": {"cobj":"0006", "value": "Doctoral thesis"},
|
"dissertation-thesis": {"cobj":"0006", "value": "Doctoral thesis"},
|
||||||
"edited-book": {"cobj":"0002", "value": "Book"},
|
"edited-book": {"cobj":"0002", "value": "Book"},
|
||||||
"encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"},
|
"encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"},
|
||||||
"lecture-speech": {"cobj":"0010", "value": "Lecture"},
|
"lecture-speech": {"cobj":"0010", "value": "Lecture"},
|
||||||
|
@ -37,5 +27,17 @@
|
||||||
"supervised-student-publication": {"cobj":"0001", "value": "Article"},
|
"supervised-student-publication": {"cobj":"0001", "value": "Article"},
|
||||||
"technical-standard": {"cobj":"0038", "value": "Other literature type"},
|
"technical-standard": {"cobj":"0038", "value": "Other literature type"},
|
||||||
"website": {"cobj":"0020", "value": "Other ORP type"},
|
"website": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
"working-paper": {"cobj":"0014", "value": "Research"}
|
"working-paper": {"cobj":"0014", "value": "Research"},
|
||||||
|
"annotation": {"cobj":"0018", "value": "Annotation"},
|
||||||
|
"physical-object": {"cobj":"0028", "value": "PhysicalObject"},
|
||||||
|
"preprint": {"cobj":"0016", "value": "Preprint"},
|
||||||
|
"software": {"cobj":"0029", "value": "Software"},
|
||||||
|
"journal-issue": {"cobj":"0001", "value": "Article"},
|
||||||
|
"translation": {"cobj":"0038", "value": "Other literature type"},
|
||||||
|
"artistic-performance": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
|
"online-resource": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
|
"registered-copyright": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
|
"trademark": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
|
"invention": {"cobj":"0020", "value": "Other ORP type"},
|
||||||
|
"spin-off-company": {"cobj":"0020", "value": "Other ORP type"}
|
||||||
}
|
}
|
|
@ -1,17 +1,18 @@
|
||||||
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>spark2GenNoDoiDatasetMaxExecutors</name>
|
||||||
|
<value>40</value>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>memory for driver process</description>
|
<description>memory for driver process</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkExecutorMemory</name>
|
<name>spark2GenNoDoiDatasetExecutorMemory</name>
|
||||||
|
<value>2G</value>
|
||||||
<description>memory for individual executor</description>
|
<description>memory for individual executor</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>sparkExecutorCores</name>
|
|
||||||
<description>number of cores used by single executor</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>oozieActionShareLibForSpark2</name>
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
<description>oozie action sharelib for spark 2.*</description>
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
@ -73,8 +74,9 @@
|
||||||
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
|
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
|
||||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
--executor-cores=${sparkExecutorCores}
|
--conf spark.dynamicAllocation.maxExecutors=${spark2GenNoDoiDatasetMaxExecutors}
|
||||||
|
--executor-memory=${spark2GenNoDoiDatasetExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -83,8 +85,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
<arg>-f</arg><arg>-</arg>
|
<arg>-i</arg><arg>last_orcid_dataset</arg>
|
||||||
<arg>-ow</arg><arg>no_doi_works/</arg>
|
|
||||||
<arg>-oew</arg><arg>no_doi_dataset</arg>
|
<arg>-oew</arg><arg>no_doi_dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -10,30 +10,28 @@ import java.nio.file.Paths;
|
||||||
import java.nio.file.StandardOpenOption;
|
import java.nio.file.StandardOpenOption;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.time.Duration;
|
import java.util.*;
|
||||||
import java.time.LocalDateTime;
|
|
||||||
import java.time.temporal.TemporalUnit;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||||
|
import org.apache.commons.compress.utils.Lists;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.methods.HttpGet;
|
import org.apache.http.client.methods.HttpGet;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.HttpClients;
|
import org.apache.http.impl.client.HttpClients;
|
||||||
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.mortbay.log.Log;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParserTest;
|
||||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||||
|
|
||||||
public class OrcidClientTest {
|
public class OrcidClientTest {
|
||||||
final String orcidId = "0000-0001-7291-3210";
|
|
||||||
final int REQ_LIMIT = 24;
|
final int REQ_LIMIT = 24;
|
||||||
final int REQ_MAX_TEST = 100;
|
final int REQ_MAX_TEST = 100;
|
||||||
final int RECORD_DOWNLOADED_COUNTER_LOG_INTERVAL = 10;
|
final int RECORD_DOWNLOADED_COUNTER_LOG_INTERVAL = 10;
|
||||||
|
@ -42,69 +40,45 @@ public class OrcidClientTest {
|
||||||
String toNotRetrieveDate = "2019-09-29 23:59:59.000000";
|
String toNotRetrieveDate = "2019-09-29 23:59:59.000000";
|
||||||
String lastUpdate = "2019-09-30 00:00:00";
|
String lastUpdate = "2019-09-30 00:00:00";
|
||||||
String shortDate = "2020-05-06 16:06:11";
|
String shortDate = "2020-05-06 16:06:11";
|
||||||
|
final String REQUEST_TYPE_RECORD = "record";
|
||||||
|
final String REQUEST_TYPE_WORK = "work/47652866";
|
||||||
|
final String REQUEST_TYPE_WORKS = "works";
|
||||||
|
|
||||||
|
private static Path testPath;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
private static void setUp() throws IOException {
|
||||||
|
testPath = Files.createTempDirectory(XMLRecordParserTest.class.getName());
|
||||||
|
System.out.println("using test path: " + testPath);
|
||||||
|
}
|
||||||
|
|
||||||
// curl -i -H "Accept: application/vnd.orcid+xml"
|
// curl -i -H "Accept: application/vnd.orcid+xml"
|
||||||
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
|
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
|
||||||
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
|
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
private void multipleDownloadTest() throws Exception {
|
public void downloadTest() throws Exception {
|
||||||
int toDownload = 10;
|
final String orcid = "0000-0001-7291-3210";
|
||||||
long start = System.currentTimeMillis();
|
String record = testDownloadRecord(orcid, REQUEST_TYPE_RECORD);
|
||||||
OrcidDownloader downloader = new OrcidDownloader();
|
String filename = testPath + "/downloaded_record_".concat(orcid).concat(".xml");
|
||||||
TarArchiveInputStream input = new TarArchiveInputStream(
|
|
||||||
new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
|
|
||||||
TarArchiveEntry entry = input.getNextTarEntry();
|
|
||||||
BufferedReader br = null;
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
int rowNum = 0;
|
|
||||||
int entryNum = 0;
|
|
||||||
int modified = 0;
|
|
||||||
while (entry != null) {
|
|
||||||
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
|
|
||||||
String line;
|
|
||||||
while ((line = br.readLine()) != null) {
|
|
||||||
String[] values = line.toString().split(",");
|
|
||||||
List<String> recordInfo = Arrays.asList(values);
|
|
||||||
String orcidId = recordInfo.get(0);
|
|
||||||
if (downloader.isModified(orcidId, recordInfo.get(3))) {
|
|
||||||
slowedDownDownload(orcidId);
|
|
||||||
modified++;
|
|
||||||
}
|
|
||||||
rowNum++;
|
|
||||||
if (modified > toDownload) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
entryNum++;
|
|
||||||
entry = input.getNextTarEntry();
|
|
||||||
}
|
|
||||||
long end = System.currentTimeMillis();
|
|
||||||
logToFile("start test: " + new Date(start).toString());
|
|
||||||
logToFile("end test: " + new Date(end).toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
private void downloadTest(String orcid) throws Exception {
|
|
||||||
String record = testDownloadRecord(orcid);
|
|
||||||
String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
|
|
||||||
File f = new File(filename);
|
File f = new File(filename);
|
||||||
OutputStream outStream = new FileOutputStream(f);
|
OutputStream outStream = new FileOutputStream(f);
|
||||||
IOUtils.write(record.getBytes(), outStream);
|
IOUtils.write(record.getBytes(), outStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String testDownloadRecord(String orcidId) throws Exception {
|
private String testDownloadRecord(String orcidId, String dataType) throws Exception {
|
||||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/" + dataType);
|
||||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||||
httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
|
httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
|
||||||
logToFile("start connection: " + new Date(System.currentTimeMillis()).toString());
|
long start = System.currentTimeMillis();
|
||||||
CloseableHttpResponse response = client.execute(httpGet);
|
CloseableHttpResponse response = client.execute(httpGet);
|
||||||
logToFile("end connection: " + new Date(System.currentTimeMillis()).toString());
|
long end = System.currentTimeMillis();
|
||||||
if (response.getStatusLine().getStatusCode() != 200) {
|
if (response.getStatusLine().getStatusCode() != 200) {
|
||||||
System.out
|
logToFile(
|
||||||
.println("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
testPath, "Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
||||||
}
|
}
|
||||||
|
logToFile(testPath, orcidId + " " + dataType + " " + (end - start) / 1000 + " seconds");
|
||||||
return IOUtils.toString(response.getEntity().getContent());
|
return IOUtils.toString(response.getEntity().getContent());
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
@ -129,7 +103,7 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
String[] values = line.split(",");
|
String[] values = line.split(",");
|
||||||
List<String> recordInfo = Arrays.asList(values);
|
List<String> recordInfo = Arrays.asList(values);
|
||||||
testDownloadRecord(recordInfo.get(0));
|
testDownloadRecord(recordInfo.get(0), REQUEST_TYPE_RECORD);
|
||||||
long endReq = System.currentTimeMillis();
|
long endReq = System.currentTimeMillis();
|
||||||
nReqTmp++;
|
nReqTmp++;
|
||||||
if (nReqTmp == REQ_LIMIT) {
|
if (nReqTmp == REQ_LIMIT) {
|
||||||
|
@ -189,20 +163,24 @@ public class OrcidClientTest {
|
||||||
final String base64CompressedRecord = IOUtils
|
final String base64CompressedRecord = IOUtils
|
||||||
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
|
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
|
||||||
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
|
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
|
||||||
logToFile("\n\ndownloaded \n\n" + recordFromSeqFile);
|
logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile);
|
||||||
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161");
|
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161", REQUEST_TYPE_RECORD);
|
||||||
assertTrue(recordFromSeqFile.equals(downloadedRecord));
|
assertTrue(recordFromSeqFile.equals(downloadedRecord));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
private void lambdaFileReaderTest() throws Exception {
|
@Disabled
|
||||||
|
public void lambdaFileReaderTest() throws Exception {
|
||||||
|
String last_update = "2021-01-12 00:00:06.685137";
|
||||||
TarArchiveInputStream input = new TarArchiveInputStream(
|
TarArchiveInputStream input = new TarArchiveInputStream(
|
||||||
new GzipCompressorInputStream(new FileInputStream("/develop/last_modified.csv.tar")));
|
new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
|
||||||
TarArchiveEntry entry = input.getNextTarEntry();
|
TarArchiveEntry entry = input.getNextTarEntry();
|
||||||
BufferedReader br = null;
|
BufferedReader br = null;
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
int rowNum = 0;
|
int rowNum = 1;
|
||||||
|
int modifiedNum = 1;
|
||||||
int entryNum = 0;
|
int entryNum = 0;
|
||||||
|
boolean firstNotModifiedFound = false;
|
||||||
while (entry != null) {
|
while (entry != null) {
|
||||||
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
|
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
|
||||||
String line;
|
String line;
|
||||||
|
@ -210,59 +188,44 @@ public class OrcidClientTest {
|
||||||
String[] values = line.toString().split(",");
|
String[] values = line.toString().split(",");
|
||||||
List<String> recordInfo = Arrays.asList(values);
|
List<String> recordInfo = Arrays.asList(values);
|
||||||
assertTrue(recordInfo.size() == 4);
|
assertTrue(recordInfo.size() == 4);
|
||||||
|
String orcid = recordInfo.get(0);
|
||||||
|
String modifiedDate = recordInfo.get(3);
|
||||||
rowNum++;
|
rowNum++;
|
||||||
if (rowNum == 1) {
|
if (rowNum == 2) {
|
||||||
assertTrue(recordInfo.get(3).equals("last_modified"));
|
assertTrue(recordInfo.get(3).equals("last_modified"));
|
||||||
} else if (rowNum == 2) {
|
} else {
|
||||||
assertTrue(recordInfo.get(0).equals("0000-0002-0499-7333"));
|
// SparkDownloadOrcidAuthors.lastUpdate = last_update;
|
||||||
|
// boolean isModified = SparkDownloadOrcidAuthors.isModified(orcid, modifiedDate);
|
||||||
|
// if (isModified) {
|
||||||
|
// modifiedNum++;
|
||||||
|
// } else {
|
||||||
|
// if (!firstNotModifiedFound) {
|
||||||
|
// firstNotModifiedFound = true;
|
||||||
|
// logToFile(orcid + " - " + modifiedDate + " > " + isModified);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
entryNum++;
|
entryNum++;
|
||||||
assertTrue(entryNum == 1);
|
assertTrue(entryNum == 1);
|
||||||
entry = input.getNextTarEntry();
|
entry = input.getNextTarEntry();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
logToFile(testPath, "modifiedNum : " + modifiedNum + " / " + rowNum);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
public static void logToFile(Path basePath, String log) throws IOException {
|
||||||
private void lambdaFileCounterTest() throws Exception {
|
|
||||||
final String lastUpdate = "2020-09-29 00:00:00";
|
|
||||||
OrcidDownloader downloader = new OrcidDownloader();
|
|
||||||
TarArchiveInputStream input = new TarArchiveInputStream(
|
|
||||||
new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
|
|
||||||
TarArchiveEntry entry = input.getNextTarEntry();
|
|
||||||
BufferedReader br = null;
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
int rowNum = 0;
|
|
||||||
int entryNum = 0;
|
|
||||||
int modified = 0;
|
|
||||||
while (entry != null) {
|
|
||||||
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
|
|
||||||
String line;
|
|
||||||
while ((line = br.readLine()) != null) {
|
|
||||||
String[] values = line.toString().split(",");
|
|
||||||
List<String> recordInfo = Arrays.asList(values);
|
|
||||||
String orcidId = recordInfo.get(0);
|
|
||||||
if (downloader.isModified(orcidId, recordInfo.get(3))) {
|
|
||||||
modified++;
|
|
||||||
}
|
|
||||||
rowNum++;
|
|
||||||
}
|
|
||||||
entryNum++;
|
|
||||||
entry = input.getNextTarEntry();
|
|
||||||
}
|
|
||||||
logToFile("rowNum: " + rowNum);
|
|
||||||
logToFile("modified: " + modified);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void logToFile(String log)
|
|
||||||
throws IOException {
|
|
||||||
log = log.concat("\n");
|
log = log.concat("\n");
|
||||||
Path path = Paths.get("/tmp/orcid_log.txt");
|
Path path = basePath.resolve("orcid_log.txt");
|
||||||
|
if (!Files.exists(path)) {
|
||||||
|
Files.createFile(path);
|
||||||
|
}
|
||||||
Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
|
Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@Disabled
|
||||||
private void slowedDownDownloadTest() throws Exception {
|
private void slowedDownDownloadTest() throws Exception {
|
||||||
String orcid = "0000-0001-5496-1243";
|
String orcid = "0000-0001-5496-1243";
|
||||||
String record = slowedDownDownload(orcid);
|
String record = slowedDownDownload(orcid);
|
||||||
|
@ -281,16 +244,17 @@ public class OrcidClientTest {
|
||||||
CloseableHttpResponse response = client.execute(httpGet);
|
CloseableHttpResponse response = client.execute(httpGet);
|
||||||
long endReq = System.currentTimeMillis();
|
long endReq = System.currentTimeMillis();
|
||||||
long reqSessionDuration = endReq - start;
|
long reqSessionDuration = endReq - start;
|
||||||
logToFile("req time (millisec): " + reqSessionDuration);
|
logToFile(testPath, "req time (millisec): " + reqSessionDuration);
|
||||||
if (reqSessionDuration < 1000) {
|
if (reqSessionDuration < 1000) {
|
||||||
logToFile("wait ....");
|
logToFile(testPath, "wait ....");
|
||||||
Thread.sleep(1000 - reqSessionDuration);
|
Thread.sleep(1000 - reqSessionDuration);
|
||||||
}
|
}
|
||||||
long end = System.currentTimeMillis();
|
long end = System.currentTimeMillis();
|
||||||
long total = end - start;
|
long total = end - start;
|
||||||
logToFile("total time (millisec): " + total);
|
logToFile(testPath, "total time (millisec): " + total);
|
||||||
if (response.getStatusLine().getStatusCode() != 200) {
|
if (response.getStatusLine().getStatusCode() != 200) {
|
||||||
logToFile("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
logToFile(
|
||||||
|
testPath, "Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
||||||
}
|
}
|
||||||
return IOUtils.toString(response.getEntity().getContent());
|
return IOUtils.toString(response.getEntity().getContent());
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
|
@ -298,4 +262,89 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
return new String("");
|
return new String("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void downloadWorkTest() throws Exception {
|
||||||
|
String orcid = "0000-0003-0015-1952";
|
||||||
|
String record = testDownloadRecord(orcid, REQUEST_TYPE_WORK);
|
||||||
|
String filename = "/tmp/downloaded_work_".concat(orcid).concat(".xml");
|
||||||
|
File f = new File(filename);
|
||||||
|
OutputStream outStream = new FileOutputStream(f);
|
||||||
|
IOUtils.write(record.getBytes(), outStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void downloadRecordTest() throws Exception {
|
||||||
|
String orcid = "0000-0001-5004-5918";
|
||||||
|
String record = testDownloadRecord(orcid, REQUEST_TYPE_RECORD);
|
||||||
|
String filename = "/tmp/downloaded_record_".concat(orcid).concat(".xml");
|
||||||
|
File f = new File(filename);
|
||||||
|
OutputStream outStream = new FileOutputStream(f);
|
||||||
|
IOUtils.write(record.getBytes(), outStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void downloadWorksTest() throws Exception {
|
||||||
|
String orcid = "0000-0001-5004-5918";
|
||||||
|
String record = testDownloadRecord(orcid, REQUEST_TYPE_WORKS);
|
||||||
|
String filename = "/tmp/downloaded_works_".concat(orcid).concat(".xml");
|
||||||
|
File f = new File(filename);
|
||||||
|
OutputStream outStream = new FileOutputStream(f);
|
||||||
|
IOUtils.write(record.getBytes(), outStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void downloadSingleWorkTest() throws Exception {
|
||||||
|
String orcid = "0000-0001-5004-5918";
|
||||||
|
String record = testDownloadRecord(orcid, REQUEST_TYPE_WORK);
|
||||||
|
String filename = "/tmp/downloaded_work_47652866_".concat(orcid).concat(".xml");
|
||||||
|
File f = new File(filename);
|
||||||
|
OutputStream outStream = new FileOutputStream(f);
|
||||||
|
IOUtils.write(record.getBytes(), outStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void cleanAuthorListTest() throws Exception {
|
||||||
|
AuthorData a1 = new AuthorData();
|
||||||
|
a1.setOid("1");
|
||||||
|
a1.setName("n1");
|
||||||
|
a1.setSurname("s1");
|
||||||
|
a1.setCreditName("c1");
|
||||||
|
AuthorData a2 = new AuthorData();
|
||||||
|
a2.setOid("1");
|
||||||
|
a2.setName("n1");
|
||||||
|
a2.setSurname("s1");
|
||||||
|
a2.setCreditName("c1");
|
||||||
|
AuthorData a3 = new AuthorData();
|
||||||
|
a3.setOid("3");
|
||||||
|
a3.setName("n3");
|
||||||
|
a3.setSurname("s3");
|
||||||
|
a3.setCreditName("c3");
|
||||||
|
List<AuthorData> list = Lists.newArrayList();
|
||||||
|
list.add(a1);
|
||||||
|
list.add(a2);
|
||||||
|
list.add(a3);
|
||||||
|
|
||||||
|
Set<String> namesAlreadySeen = new HashSet<>();
|
||||||
|
assertTrue(list.size() == 3);
|
||||||
|
list.removeIf(a -> !namesAlreadySeen.add(a.getOid()));
|
||||||
|
assertTrue(list.size() == 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Ignore
|
||||||
|
public void testUpdatedRecord() throws Exception {
|
||||||
|
final String base64CompressedRecord = IOUtils
|
||||||
|
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
|
||||||
|
final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
|
||||||
|
logToFile(testPath, "\n\nrecord updated \n\n" + record);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Ignore
|
||||||
|
private void testUpdatedWork() throws Exception {
|
||||||
|
final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA==";
|
||||||
|
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||||
|
logToFile(testPath, "\n\nwork updated \n\n" + work);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,20 +1,44 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcid.xml;
|
package eu.dnetlib.doiboost.orcid.xml;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.Work;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
|
import eu.dnetlib.doiboost.orcid.OrcidClientTest;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
|
||||||
public class XMLRecordParserTest {
|
public class XMLRecordParserTest {
|
||||||
|
private static final String NS_WORK = "work";
|
||||||
|
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
|
||||||
|
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
|
||||||
|
private static final String NS_COMMON = "common";
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
private static Path testPath;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
private static void setUp() throws IOException {
|
||||||
|
testPath = Files.createTempDirectory(XMLRecordParserTest.class.getName());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
private void testOrcidAuthorDataXMLParser() throws Exception {
|
public void testOrcidAuthorDataXMLParser() throws Exception {
|
||||||
|
|
||||||
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
|
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
|
||||||
|
|
||||||
|
@ -26,10 +50,11 @@ public class XMLRecordParserTest {
|
||||||
System.out.println("name: " + authorData.getName());
|
System.out.println("name: " + authorData.getName());
|
||||||
assertNotNull(authorData.getSurname());
|
assertNotNull(authorData.getSurname());
|
||||||
System.out.println("surname: " + authorData.getSurname());
|
System.out.println("surname: " + authorData.getSurname());
|
||||||
|
OrcidClientTest.logToFile(testPath, OBJECT_MAPPER.writeValueAsString(authorData));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
private void testOrcidXMLErrorRecordParser() throws Exception {
|
public void testOrcidXMLErrorRecordParser() throws Exception {
|
||||||
|
|
||||||
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
|
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
|
||||||
|
|
||||||
|
@ -42,7 +67,7 @@ public class XMLRecordParserTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
private void testOrcidWorkDataXMLParser() throws Exception {
|
public void testOrcidWorkDataXMLParser() throws Exception {
|
||||||
|
|
||||||
String xml = IOUtils
|
String xml = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
|
@ -54,8 +79,7 @@ public class XMLRecordParserTest {
|
||||||
assertNotNull(workData);
|
assertNotNull(workData);
|
||||||
assertNotNull(workData.getOid());
|
assertNotNull(workData.getOid());
|
||||||
System.out.println("oid: " + workData.getOid());
|
System.out.println("oid: " + workData.getOid());
|
||||||
assertNotNull(workData.getDoi());
|
assertNull(workData.getDoi());
|
||||||
System.out.println("doi: " + workData.getDoi());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -64,9 +88,6 @@ public class XMLRecordParserTest {
|
||||||
String xml = IOUtils
|
String xml = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml"));
|
this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml"));
|
||||||
|
|
||||||
XMLRecordParser p = new XMLRecordParser();
|
|
||||||
|
|
||||||
AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes());
|
AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes());
|
||||||
assertNotNull(authorData);
|
assertNotNull(authorData);
|
||||||
assertNotNull(authorData.getOtherNames());
|
assertNotNull(authorData.getOtherNames());
|
||||||
|
@ -74,4 +95,43 @@ public class XMLRecordParserTest {
|
||||||
String jsonData = JsonWriter.create(authorData);
|
String jsonData = JsonWriter.create(authorData);
|
||||||
assertNotNull(jsonData);
|
assertNotNull(jsonData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// @Test
|
||||||
|
// private void testWorkIdLastModifiedDateXMLParser() throws Exception {
|
||||||
|
// String xml = IOUtils
|
||||||
|
// .toString(
|
||||||
|
// this.getClass().getResourceAsStream("record_0000-0001-5004-5918.xml"));
|
||||||
|
// Map<String, String> workIdLastModifiedDate = XMLRecordParser.retrieveWorkIdLastModifiedDate(xml.getBytes());
|
||||||
|
// workIdLastModifiedDate.forEach((k, v) -> {
|
||||||
|
// try {
|
||||||
|
// OrcidClientTest
|
||||||
|
// .logToFile(
|
||||||
|
// k + " " + v + " isModified after " + SparkDownloadOrcidWorks.lastUpdateValue + ": "
|
||||||
|
// + SparkDownloadOrcidWorks.isModified("0000-0001-5004-5918", v));
|
||||||
|
// } catch (IOException e) {
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
// }
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAuthorSummaryXMLParser() throws Exception {
|
||||||
|
String xml = IOUtils
|
||||||
|
.toString(
|
||||||
|
this.getClass().getResourceAsStream("record_0000-0001-5004-5918.xml"));
|
||||||
|
AuthorSummary authorSummary = XMLRecordParser.VTDParseAuthorSummary(xml.getBytes());
|
||||||
|
authorSummary.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml));
|
||||||
|
OrcidClientTest.logToFile(testPath, JsonWriter.create(authorSummary));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWorkDataXMLParser() throws Exception {
|
||||||
|
String xml = IOUtils
|
||||||
|
.toString(
|
||||||
|
this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
|
||||||
|
WorkDetail workDetail = XMLRecordParserNoDoi.VTDParseWorkData(xml.getBytes());
|
||||||
|
Work work = new Work();
|
||||||
|
work.setWorkDetail(workDetail);
|
||||||
|
work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml));
|
||||||
|
OrcidClientTest.logToFile(testPath, JsonWriter.create(work));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,8 +21,8 @@ import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
import eu.dnetlib.dhp.schema.orcid.Contributor;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
|
|
||||||
public class OrcidNoDoiTest {
|
public class OrcidNoDoiTest {
|
||||||
|
@ -48,7 +48,7 @@ public class OrcidNoDoiTest {
|
||||||
if (p == null) {
|
if (p == null) {
|
||||||
logger.info("XMLRecordParserNoDoi null");
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
}
|
}
|
||||||
WorkDataNoDoi workData = null;
|
WorkDetail workData = null;
|
||||||
try {
|
try {
|
||||||
workData = p.VTDParseWorkData(xml.getBytes());
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -105,7 +105,7 @@ public class OrcidNoDoiTest {
|
||||||
if (p == null) {
|
if (p == null) {
|
||||||
logger.info("XMLRecordParserNoDoi null");
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
}
|
}
|
||||||
WorkDataNoDoi workData = null;
|
WorkDetail workData = null;
|
||||||
try {
|
try {
|
||||||
workData = p.VTDParseWorkData(xml.getBytes());
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -136,7 +136,7 @@ public class OrcidNoDoiTest {
|
||||||
if (p == null) {
|
if (p == null) {
|
||||||
logger.info("XMLRecordParserNoDoi null");
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
}
|
}
|
||||||
WorkDataNoDoi workData = null;
|
WorkDetail workData = null;
|
||||||
try {
|
try {
|
||||||
workData = p.VTDParseWorkData(xml.getBytes());
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -179,7 +179,7 @@ public class OrcidNoDoiTest {
|
||||||
if (p == null) {
|
if (p == null) {
|
||||||
logger.info("XMLRecordParserNoDoi null");
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
}
|
}
|
||||||
WorkDataNoDoi workData = null;
|
WorkDetail workData = null;
|
||||||
try {
|
try {
|
||||||
workData = p.VTDParseWorkData(xml.getBytes());
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -308,7 +308,7 @@ public class OrcidNoDoiTest {
|
||||||
if (p == null) {
|
if (p == null) {
|
||||||
logger.info("XMLRecordParserNoDoi null");
|
logger.info("XMLRecordParserNoDoi null");
|
||||||
}
|
}
|
||||||
WorkDataNoDoi workData = null;
|
WorkDetail workData = null;
|
||||||
try {
|
try {
|
||||||
workData = p.VTDParseWorkData(xml.getBytes());
|
workData = p.VTDParseWorkData(xml.getBytes());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
H4sIAAAAAAAAAO1c63LbNhb+n6fA6EebTE2JulpyYnXVpE2a1Jus7V5mO/0BkZCImCJVgLSidjqzf/cJ9oH2TfZJ9jsASVESLWdsddNulJlcDJxzcO4XEJMnn7+bhexaKC3j6LTWrLs1JiIv9mU0Pa19e/mV068xnfDI52EcidPaUuja58MHTxaxujqhP9g8TRwgYK/Xb/Z7TbdZY3OeBKe1hotfDn63nF6v13GOO91mg3AaK8hrqeVYhjJZntbm6TiUXo2BpUifyCgRKuLhaS1IkvlJo7FYLOqx8qSPP6eNSDdyiBxD+KnHEyPITSgFSI7jS53IyNuNVQIq8MRcCZAS/g60AibHipNAKCfiM3Ez1gomx5qJ2RgWCuT8ZqwVTKENpWK1QxO0ncN68Wy2SwF2P4eGULHaIbfdz6HnYCuGlRxfJFyG+ma8TcicwpVYLnYemAEUks+AvUNy2i5g31kfcqQvokROpNils23gnM4kjWzM3ISbARRaUWIiFEJN7FLICijH476vhN6BkwGsouhawgGdeazlbiffhMwpUMDejEW7OWSAMInV8mbgDGBlp3kYL2dQ5S5j5TA51s8pD6H62yJ9DSzH1UJdS29H8GUA6757m8cWtkGGgA7lLpOuYFbRpAVXHgV9qna47TrcikP8rMS1FItdbBZAOd44DXdYlXY3+QMBHadql/a2QGvDBwy/ntj8ceIpQdnQ8fHnsOW2UByaTtu9bLVOOv2TJqpPx/37k0YV9BqdkOvEmaFIIQLL1Jqu02pdus0T1z1xe/VOu7+iVoGzRtMybNe21x0vlPBBBP4KogyVKjkkrWioZaUSi9QYvXnjdH948bfLL1vtN98evx5dXA4KvgizkiTV0OFOVANRiRvEOhkWfBQIZnklYeNWETeUQEVp+ApZ7FPNnsZhKKaCfRNHfhxt0jKQDypOyRZN+5DIJKzQuF2+iD3JQ/aF4jJiX6W2+mLhjCepMkHNsPFXsRjHKmJfRxMeJZp9L5OAoVsx/4jThHH2FZ/JcMle2NzD4gkbpYnUM3YxF16i0hl7JjWqh1AFqyXGnjQ2WbW8v4U0VAnsxsvR2Qi8JKYhiuciytDWoUroOohVgjqnPSXnJMzwkzB5PP9kmjz+ejbHHkfSP2HfBzxhUkNShD1lZxYrxr2fU6nwb8gfiVSh97oWYTynJAkFeTCISeCa6dSDNjTjVmCdC+xnArOHo4tnj+iAKCZVTeQ7OiJNoAdxxMbQn4x0IrhPMJxdp2EkFLf9GktiLBU0odcEtkr0ERO0CONB69paEVGHVJyGlPfq7GtbPZdwJIZmh41lHMZTpOqQzYQX8AjM4jhtkEnoBVl1/XAljBI0C+P4ighBTOQeHAmtIPELWkApQ3cZkihiEithTzMeBXl0wOcgPl4SXBLxZOP8yEcoGxTxDolemjpMcobI4DjRcIVtLTLJ62wUyRmo6CT1ISn0P50KnQAIZtSp9gRsvdJehfFyy+B4JTVILAIRsamIRCK9nCWBSq3iKEMB3JVmE8sqeCnZn4foV6gZp7bFsK6XkRcAN051poisIBm9kawkqdUF/Sv2rRskKN0sgEojsKugTnAl3iGyIuuHQTrj5I0I0QQmJmduGG8u3Pr1+K2go+DVlzEZF00KSUfdrmU0slENLiercJ+twp3Yt+5kOfek8lKo3fjmhrPAl23YB6Wwv3hmQ8akjEomnwktp9ERuxAJGv7pkUklb7iC8uWcEswJMo1VhhdTCBtTG+rtXiF+xkJkebFZqJKdoxUKukOhFrAoJJ5aa1MRjSgPMDjV1Ph4wi4SdhnEM1jiRaznkuwEmWwSPmJfRtMQ5x6xVBt45gtfmgkkO6lQXk5SLxHfMxg0WZBNX6aRYK32EWu5za4Vf5ROU/hw06z160hza1IiaShNqWyqhADPIScj203S+MPzzx4ZOmRoG4V5JIfC5BBKTiSvDSIDu6bJSgU+PHcesQUo4khPpSY3ZjFgbVJnFyVf
p1CD7GVnt3pQYmpCJZTRFUiAn8zHch9kC07Gns05Um6Vz5wRmdc2Z1ruzwTXKax3ws4z6vhhjr8pFxkut84gQbQIESG5Bxetv82zZjbWAXZnGI4cjthYaqlzzbKQ0shmhBfiEkVwKbgXZBIbsVINelQfQNSwbLJb7JVYswUlEiXF8YwEtuCJMSUn2slZqrPnKk7nJudnw8sR0UgUOgZyOaMA8Q7ehfYBLj2WKgmKn7THI+t4U0Pm3/8yO2bW54YlkDP6yvNPlVHOhUa1gQUuoZuJJF7R8qFciYR4AZummE5Ys8/OPwN12z48bLYRf6F4DIX4EhntR8WjqfjJVAjkW41SR25UZrXTqg/a7MeOW3ddp9Op93s/gT9xpa3b0wHOfQ/ouuzH9qDeGtAB3X5+QDkYg9hqBdIEqNeUx8z4EyUmaqaUZo2TbNWBzQqgAJwYhqgAKLiClrDZjD1M/vOPf57id6ve6T9mb7Kf0LVbUUMxAR4Kl7B9CKVNsFagteuD3jpandIpJlZTr45sijCeycsC3OgJuV8T1zzK2NViSpXRNCQmMCami0lDXubEbVcI4ME9AZeIEvNWGzn1E1Yi4ZZJgJ45ahuyVe83NyA3VFyGPT6uoloJ2u2ugVptrrz56DZ7+4JGLMoBMRX19oBSTadrnevTbZc8onpNGNXkstNklFOFZUqub84w6RmzQdZcVIXu0zjywlTbBgZGOUdavLbt8EWl1+q8GfSZj2kKGWa9aVilMkRClsxMQTTtOvLVJdVzW8gncWoSKrXdRatguxvoM+DXtqzeUvOMB290JFshuDvPkuT+Uq9LYlx/JYG6obrMVQzXNR2APdWx3X5WdWAQRLMhWtJ/NrFsDyalqcVDv7Fa2153kuVcDMdynIh3Gb31rZvwrnmYiuFfTKMVil87/nG33ez1B72+3/EHYtxqdwb+2D9u9pu+N3aPQMeMVIbWKat9gGGxRkzwMaIDnmiYOAxuh8Htzz64/fGmtMNIdhjJdo5kh/nrQ89fh2HrMGwdhq0//rB1mKz+h5OVnQ9S1EqVDSkv0Vsm7KnkSqF6c8PIS8ooaFzZ60/PoGgvQCuccJC2BuIhYhIjx0wie19blGd8gj6XfUGdQyjM0jeph940Zk8NN7HzHHnOt1ujCBxES/ZGIcLMypczMPwiBffWCy4SIaOFQGf168sYrERYfxXyVP+WcUhrnL1C6uQ6o0Bl/41QympztBRoydlLfk3lDAvfhdwHz4qDeIwKFIiM93MevYUORldxKK64sudTqQ7Yd9JLYpUdqcU8YC/4WzKekVl4aKLYWarmwTLTwrUEJ/6CK99ydYlaeCXZCIIG0qw8p3YCzdOZNwqpbTMmWULDLJ8b0T4NzOoM9THIVvlc0ZIfS1YANt1603Wbjbcc/mrdmz7z1YlAvdnv9Q0V8DhNKW0SCjV+6BjMxnUcpjORH2qWsk+DmWtsfj80IFLraMVq97jjtPtu12zl7YiirREsSrkbjY9vhrFRFiH08oGgo5QeB2WEOlj6bXM6twN4+Yvn+qyffbClGT7/ppkN6/kH0mK8L75fm9dclvzqc3sZgkwxJA0WH17NyhacMc7Q7RRgdmELzufLodstoOjH9U/Q1Szl6KXXPXqbeGm3+pt7CcBedmSfwkk9WCuY2IK7lZo1Tn4p4tCtiEPXIg7dizjli5HKQ0q23XVKRKkrlL9Qy438oaV5l4N6JGp3P3tF9HYGbLZHug3kfIhmfFJJcQ1q+y1DpZnubsP5bA+Wa7uDbrPZ6/xe1tlJ/89uAbEHA7Qc3aq7Tr/r9jrtVrvd7f5epnjPk/7sRkFtvLdRbi2pv5eN7nbwhzdZ1Y5eL2GpCotnaFdeOEdrVcffde7V06uGuZ4OGyJqlAqhbtjm1TGXL86qa3ZWHbKDjaxjd7IJw6HW20GX5WT3QQ537H2Qk90HOfHEsffXTn7X7OS3pA/fp6A8qgfJLCw9lAvXvkXQjYYcpziqXK0396qN
VQJwzDO5dbB1ldqXfWsP+/KH7U3neNBpOt1W2y3xKW+mZp7s7cKueNPXeD+mM9ExrMnEvr/bHDjO4uiXOH+aVgasolM6jCf2n0JXCLYFrdDbD+3gkx+1ubsh33sduA32wazecvpuu+30Bt0dzzhvtHoV9l6tftNIeTD8/Q3fG7htRO3gLuFehb1Pw2/eFhzsfV97t52WOzh2BseDH+5g7yrsfdp7/SLoI7T2lsDV92AHzYjh2jXgQSFiWLoF/QjVsfe62G73eo47aLfuVBe3sffaELFXxSX3R2jrigaxfKN/0Aglg+KDxkeojr3PxL1O59jptbp3aZqqsPeZDMrfqj5CW28JXPWp7qAXGqbWvlR+hCrZe4/QbTc7znGv1btTj7CNvc+0sPYR+mDs+xu71Ru4Trcz6N7J2NvY+70hK70vOBh7D7di+f/ucrdbsS3svd6S2Kcjz7PHIwdz3/9SrNOnTxdu7y6JvAp7r/1ddtGx9j7oYPQ9TPjdrus00ZzfbcLfwt6n0deefh2MfX9jdzq9ntNqd9p3MvY29j6Nvfmq7//M3tvrG9/480eG5j9dG4rVf72yvvEgI0R/DB/8F4+Tql7oTQAA
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
@ -732,7 +732,7 @@
|
||||||
<common:external-id-relationship>part-of</common:external-id-relationship>
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
</common:external-id>
|
</common:external-id>
|
||||||
</common:external-ids>
|
</common:external-ids>
|
||||||
<work:work-summary put-code="0" visibility="private">
|
<work:work-summary put-code="123456" visibility="private">
|
||||||
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
<common:source>
|
<common:source>
|
||||||
|
|
Loading…
Reference in New Issue