[ORCID-no-doi] integrating PR#98 #98
parent
70e49ed53c
commit
ee34cc51c3
@ -0,0 +1,31 @@
|
||||
diff a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java (rejected hunks)
|
||||
@@ -1,8 +1,6 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
-import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
-
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
import java.text.ParseException;
|
||||
@@ -10,6 +8,8 @@ import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
+import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
+
|
||||
/**
|
||||
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
|
||||
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
|
||||
@@ -137,7 +137,10 @@ public class Relation extends Oaf {
|
||||
try {
|
||||
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
|
||||
} catch (ParseException e) {
|
||||
- throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
|
||||
+ throw new IllegalArgumentException(String
|
||||
+ .format(
|
||||
+ "invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
|
||||
+ getValidationDate()));
|
||||
}
|
||||
|
||||
super.mergeFrom(r);
|
@ -0,0 +1,79 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean of the {@code eu.dnetlib.dhp.schema.orcid} package exposing eight read/write
 * properties: five string properties (creation method and four dates kept as plain strings) and
 * three boolean flags. No validation or conversion is applied by the accessors.
 */
public class AuthorHistory implements Serializable {

    // --- string properties ---
    private String creationMethod;
    private String completionDate;
    private String submissionDate;
    private String lastModifiedDate;
    private String deactivationDate;

    // --- boolean flags (default to false) ---
    private boolean claimed;
    private boolean verifiedEmail;
    private boolean verifiedPrimaryEmail;

    /** @return the stored creation method, or {@code null} when never set */
    public String getCreationMethod() {
        return this.creationMethod;
    }

    /** @param creationMethod value to store as-is */
    public void setCreationMethod(String creationMethod) {
        this.creationMethod = creationMethod;
    }

    /** @return the stored completion date string, or {@code null} when never set */
    public String getCompletionDate() {
        return this.completionDate;
    }

    /** @param completionDate value to store as-is */
    public void setCompletionDate(String completionDate) {
        this.completionDate = completionDate;
    }

    /** @return the stored submission date string, or {@code null} when never set */
    public String getSubmissionDate() {
        return this.submissionDate;
    }

    /** @param submissionDate value to store as-is */
    public void setSubmissionDate(String submissionDate) {
        this.submissionDate = submissionDate;
    }

    /** @return the stored last-modified date string, or {@code null} when never set */
    public String getLastModifiedDate() {
        return this.lastModifiedDate;
    }

    /** @param lastModifiedDate value to store as-is */
    public void setLastModifiedDate(String lastModifiedDate) {
        this.lastModifiedDate = lastModifiedDate;
    }

    /** @return the stored deactivation date string, or {@code null} when never set */
    public String getDeactivationDate() {
        return this.deactivationDate;
    }

    /** @param deactivationDate value to store as-is */
    public void setDeactivationDate(String deactivationDate) {
        this.deactivationDate = deactivationDate;
    }

    /** @return the {@code claimed} flag */
    public boolean isClaimed() {
        return this.claimed;
    }

    /** @param claimed new value of the {@code claimed} flag */
    public void setClaimed(boolean claimed) {
        this.claimed = claimed;
    }

    /** @return the {@code verifiedEmail} flag */
    public boolean isVerifiedEmail() {
        return this.verifiedEmail;
    }

    /** @param verifiedEmail new value of the {@code verifiedEmail} flag */
    public void setVerifiedEmail(boolean verifiedEmail) {
        this.verifiedEmail = verifiedEmail;
    }

    /** @return the {@code verifiedPrimaryEmail} flag */
    public boolean isVerifiedPrimaryEmail() {
        return this.verifiedPrimaryEmail;
    }

    /** @param verifiedPrimaryEmail new value of the {@code verifiedPrimaryEmail} flag */
    public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) {
        this.verifiedPrimaryEmail = verifiedPrimaryEmail;
    }
}
|
@ -0,0 +1,25 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class AuthorSummary extends OrcidData implements Serializable {
|
||||
private AuthorData authorData;
|
||||
private AuthorHistory authorHistory;
|
||||
|
||||
public AuthorData getAuthorData() {
|
||||
return authorData;
|
||||
}
|
||||
|
||||
public void setAuthorData(AuthorData authorData) {
|
||||
this.authorData = authorData;
|
||||
}
|
||||
|
||||
public AuthorHistory getAuthorHistory() {
|
||||
return authorHistory;
|
||||
}
|
||||
|
||||
public void setAuthorHistory(AuthorHistory authorHistory) {
|
||||
this.authorHistory = authorHistory;
|
||||
}
|
||||
}
|
@ -1,11 +1,13 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* This class models the data related to external id, that are retrieved from an orcid publication
|
||||
*/
|
||||
|
||||
public class ExternalId {
|
||||
public class ExternalId implements Serializable {
|
||||
private String type;
|
||||
private String value;
|
||||
private String relationShip;
|
@ -0,0 +1,34 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable base bean with three string properties visible to subclasses:
 * {@code base64CompressData}, {@code statusCode} and {@code downloadDate}.
 * The accessors store and return the values unchanged.
 */
public class OrcidData implements Serializable {

    protected String base64CompressData;
    protected String statusCode;
    protected String downloadDate;

    /** @return the stored {@code base64CompressData} value, or {@code null} when never set */
    public String getBase64CompressData() {
        return this.base64CompressData;
    }

    /** @param base64CompressData value to store as-is */
    public void setBase64CompressData(String base64CompressData) {
        this.base64CompressData = base64CompressData;
    }

    /** @return the stored status code string, or {@code null} when never set */
    public String getStatusCode() {
        return this.statusCode;
    }

    /** @param statusCode value to store as-is */
    public void setStatusCode(String statusCode) {
        this.statusCode = statusCode;
    }

    /** @return the stored download date string, or {@code null} when never set */
    public String getDownloadDate() {
        return this.downloadDate;
    }

    /** @param downloadDate value to store as-is */
    public void setDownloadDate(String downloadDate) {
        this.downloadDate = downloadDate;
    }
}
|
@ -1,11 +1,13 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* This class models the data related to a publication date, that are retrieved from an orcid publication
|
||||
*/
|
||||
|
||||
public class PublicationDate {
|
||||
public class PublicationDate implements Serializable {
|
||||
private String year;
|
||||
private String month;
|
||||
private String day;
|
@ -0,0 +1,79 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean of the {@code eu.dnetlib.dhp.schema.orcid} package exposing five string
 * properties (creation method and four dates kept as plain strings) and three boolean flags.
 * The accessors perform no validation or conversion.
 */
public class Summary implements Serializable {

    // --- string properties ---
    private String creationMethod;
    private String completionDate;
    private String submissionDate;
    private String lastModifiedDate;
    private String deactivationDate;

    // --- boolean flags (default to false) ---
    private boolean claimed;
    private boolean verifiedEmail;
    private boolean verifiedPrimaryEmail;

    /** @return the stored creation method, or {@code null} when never set */
    public String getCreationMethod() {
        return this.creationMethod;
    }

    /** @param creationMethod value to store as-is */
    public void setCreationMethod(String creationMethod) {
        this.creationMethod = creationMethod;
    }

    /** @return the stored completion date string, or {@code null} when never set */
    public String getCompletionDate() {
        return this.completionDate;
    }

    /** @param completionDate value to store as-is */
    public void setCompletionDate(String completionDate) {
        this.completionDate = completionDate;
    }

    /** @return the stored submission date string, or {@code null} when never set */
    public String getSubmissionDate() {
        return this.submissionDate;
    }

    /** @param submissionDate value to store as-is */
    public void setSubmissionDate(String submissionDate) {
        this.submissionDate = submissionDate;
    }

    /** @return the stored last-modified date string, or {@code null} when never set */
    public String getLastModifiedDate() {
        return this.lastModifiedDate;
    }

    /** @param lastModifiedDate value to store as-is */
    public void setLastModifiedDate(String lastModifiedDate) {
        this.lastModifiedDate = lastModifiedDate;
    }

    /** @return the stored deactivation date string, or {@code null} when never set */
    public String getDeactivationDate() {
        return this.deactivationDate;
    }

    /** @param deactivationDate value to store as-is */
    public void setDeactivationDate(String deactivationDate) {
        this.deactivationDate = deactivationDate;
    }

    /** @return the {@code claimed} flag */
    public boolean isClaimed() {
        return this.claimed;
    }

    /** @param claimed new value of the {@code claimed} flag */
    public void setClaimed(boolean claimed) {
        this.claimed = claimed;
    }

    /** @return the {@code verifiedEmail} flag */
    public boolean isVerifiedEmail() {
        return this.verifiedEmail;
    }

    /** @param verifiedEmail new value of the {@code verifiedEmail} flag */
    public void setVerifiedEmail(boolean verifiedEmail) {
        this.verifiedEmail = verifiedEmail;
    }

    /** @return the {@code verifiedPrimaryEmail} flag */
    public boolean isVerifiedPrimaryEmail() {
        return this.verifiedPrimaryEmail;
    }

    /** @param verifiedPrimaryEmail new value of the {@code verifiedPrimaryEmail} flag */
    public void setVerifiedPrimaryEmail(boolean verifiedPrimaryEmail) {
        this.verifiedPrimaryEmail = verifiedPrimaryEmail;
    }
}
|
@ -0,0 +1,16 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Work extends OrcidData implements Serializable {
|
||||
WorkDetail workDetail;
|
||||
|
||||
public WorkDetail getWorkDetail() {
|
||||
return workDetail;
|
||||
}
|
||||
|
||||
public void setWorkDetail(WorkDetail workDetail) {
|
||||
this.workDetail = workDetail;
|
||||
}
|
||||
}
|
@ -1,14 +1,19 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcidnodoi.model;
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.orcid.Contributor;
|
||||
import eu.dnetlib.dhp.schema.orcid.ExternalId;
|
||||
import eu.dnetlib.dhp.schema.orcid.OrcidData;
|
||||
import eu.dnetlib.dhp.schema.orcid.PublicationDate;
|
||||
|
||||
/**
|
||||
* This class models the data that are retrieved from orcid publication
|
||||
*/
|
||||
|
||||
public class WorkDataNoDoi implements Serializable {
|
||||
public class WorkDetail implements Serializable {
|
||||
|
||||
private String oid;
|
||||
private String id;
|
@ -1,208 +0,0 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.*;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class OrcidDownloader extends OrcidDSManager {
|
||||
|
||||
static final int REQ_LIMIT = 24;
|
||||
static final int REQ_MAX_TEST = -1;
|
||||
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
|
||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
static final String lastUpdate = "2020-09-29 00:00:00";
|
||||
private String lambdaFileName;
|
||||
private String outputPath;
|
||||
private String token;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
OrcidDownloader orcidDownloader = new OrcidDownloader();
|
||||
orcidDownloader.loadArgs(args);
|
||||
orcidDownloader.parseLambdaFile();
|
||||
}
|
||||
|
||||
private String downloadRecord(String orcidId) throws IOException {
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
if (response.getStatusLine().getStatusCode() != 200) {
|
||||
Log
|
||||
.info(
|
||||
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
||||
return new String("");
|
||||
}
|
||||
// return IOUtils.toString(response.getEntity().getContent());
|
||||
return xmlStreamToString(response.getEntity().getContent());
|
||||
}
|
||||
}
|
||||
|
||||
private String xmlStreamToString(InputStream xmlStream) throws IOException {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(xmlStream));
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
public void parseLambdaFile() throws Exception {
|
||||
int parsedRecordsCounter = 0;
|
||||
int downloadedRecordsCounter = 0;
|
||||
int savedRecordsCounter = 0;
|
||||
long startDownload = 0;
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
||||
Path hdfsoutputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(workingPath)
|
||||
.concat(outputPath)
|
||||
.concat("updated_xml_authors.seq"));
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
||||
new GzipCompressorInputStream(lambdaFileStream))) {
|
||||
TarArchiveEntry entry = null;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfsoutputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
|
||||
startDownload = System.currentTimeMillis();
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
String[] values = line.split(",");
|
||||
List<String> recordInfo = Arrays.asList(values);
|
||||
int nReqTmp = 0;
|
||||
long startReqTmp = System.currentTimeMillis();
|
||||
// skip headers line
|
||||
if (parsedRecordsCounter == 0) {
|
||||
parsedRecordsCounter++;
|
||||
continue;
|
||||
}
|
||||
parsedRecordsCounter++;
|
||||
String orcidId = recordInfo.get(0);
|
||||
if (isModified(orcidId, recordInfo.get(3))) {
|
||||
String record = downloadRecord(orcidId);
|
||||
downloadedRecordsCounter++;
|
||||
if (!record.isEmpty()) {
|
||||
// String compressRecord = ArgumentApplicationParser.compressArgument(record);
|
||||
final Text key = new Text(recordInfo.get(0));
|
||||
final Text value = new Text(record);
|
||||
writer.append(key, value);
|
||||
savedRecordsCounter++;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
long endReq = System.currentTimeMillis();
|
||||
nReqTmp++;
|
||||
if (nReqTmp == REQ_LIMIT) {
|
||||
long reqSessionDuration = endReq - startReqTmp;
|
||||
if (reqSessionDuration <= 1000) {
|
||||
Log
|
||||
.info(
|
||||
"\nreqSessionDuration: "
|
||||
+ reqSessionDuration
|
||||
+ " nReqTmp: "
|
||||
+ nReqTmp
|
||||
+ " wait ....");
|
||||
Thread.sleep(1000 - reqSessionDuration);
|
||||
} else {
|
||||
nReqTmp = 0;
|
||||
startReqTmp = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||
Log
|
||||
.info(
|
||||
"Current parsed: "
|
||||
+ parsedRecordsCounter
|
||||
+ " downloaded: "
|
||||
+ downloadedRecordsCounter
|
||||
+ " saved: "
|
||||
+ savedRecordsCounter);
|
||||
if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
long endDownload = System.currentTimeMillis();
|
||||
long downloadTime = endDownload - startDownload;
|
||||
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.info("Download started at: " + new Date(startDownload).toString());
|
||||
Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
|
||||
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
|
||||
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
|
||||
Log.info("Saved Records Counter: " + savedRecordsCounter);
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
OrcidDownloader.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Default Path: " + workingPath);
|
||||
lambdaFileName = parser.get("lambdaFileName");
|
||||
Log.info("Lambda File Name: " + lambdaFileName);
|
||||
outputPath = parser.get("outputPath");
|
||||
Log.info("Output Data: " + outputPath);
|
||||
token = parser.get("token");
|
||||
}
|
||||
|
||||
public boolean isModified(String orcidId, String modifiedDate) {
|
||||
Date modifiedDateDt = null;
|
||||
Date lastUpdateDt = null;
|
||||
try {
|
||||
if (modifiedDate.length() != 19) {
|
||||
modifiedDate = modifiedDate.substring(0, 19);
|
||||
}
|
||||
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
||||
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||
} catch (Exception e) {
|
||||
Log.info("[" + orcidId + "] Parsing date: ", e.getMessage());
|
||||
return true;
|
||||
}
|
||||
return modifiedDateDt.after(lastUpdateDt);
|
||||
}
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java (rejected hunks)
|
||||
@@ -31,7 +32,6 @@ public class SparkDownloadOrcidAuthors {
|
||||
|
||||
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
|
||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
- static String lastUpdate;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
@@ -54,14 +54,18 @@ public class SparkDownloadOrcidAuthors {
|
||||
final String token = parser.get("token");
|
||||
final String lambdaFileName = parser.get("lambdaFileName");
|
||||
logger.info("lambdaFileName: {}", lambdaFileName);
|
||||
-
|
||||
- lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
|
||||
+ final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
+ String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||
+ logger.info("lastUpdate: ", lastUpdate);
|
||||
+ if (StringUtils.isBlank(lastUpdate)) {
|
||||
+ throw new RuntimeException("last update info not found");
|
||||
+ }
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
|
@ -0,0 +1,251 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkDownloadOrcidWorks {
|
||||
|
||||
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidWorks.class);
|
||||
public static final String LAMBDA_FILE_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
public static final DateTimeFormatter LAMBDA_FILE_DATE_FORMATTER = DateTimeFormatter
|
||||
.ofPattern(LAMBDA_FILE_DATE_FORMAT);
|
||||
public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
|
||||
public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
|
||||
.ofPattern(ORCID_XML_DATETIME_FORMAT);
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDownloadOrcidWorks.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
logger.info("workingPath: ", workingPath);
|
||||
final String outputPath = parser.get("outputPath");
|
||||
final String token = parser.get("token");
|
||||
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||
logger.info("lastUpdateValue: ", lastUpdateValue);
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
|
||||
LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
|
||||
LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
|
||||
LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
|
||||
LongAccumulator maxModifiedWorksLimitAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("max_modified_works_limit");
|
||||
LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
|
||||
LongAccumulator errorLoadingJsonFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_loading_json_found");
|
||||
LongAccumulator errorLoadingXMLFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_loading_xml_found");
|
||||
LongAccumulator errorParsingXMLFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_parsing_xml_found");
|
||||
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
||||
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
||||
LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404");
|
||||
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
||||
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
||||
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
|
||||
LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
|
||||
|
||||
JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
|
||||
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
|
||||
updatedAuthorsAcc.setValue(updatedAuthorsRDD.count());
|
||||
|
||||
FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
|
||||
String orcidId = data._1().toString();
|
||||
String jsonData = data._2().toString();
|
||||
List<String> workIds = new ArrayList<>();
|
||||
Map<String, String> workIdLastModifiedDate = new HashMap<>();
|
||||
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||
String statusCode = getJsonValue(jElement, "statusCode");
|
||||
if (statusCode.equals("200")) {
|
||||
String compressedData = getJsonValue(jElement, "compressedData");
|
||||
if (StringUtils.isEmpty(compressedData)) {
|
||||
errorLoadingJsonFoundAcc.add(1);
|
||||
} else {
|
||||
String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
|
||||
if (StringUtils.isEmpty(authorSummary)) {
|
||||
errorLoadingXMLFoundAcc.add(1);
|
||||
} else {
|
||||
try {
|
||||
workIdLastModifiedDate = XMLRecordParser
|
||||
.retrieveWorkIdLastModifiedDate(authorSummary.getBytes());
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing " + orcidId + " [" + jsonData + "]", e);
|
||||
errorParsingXMLFoundAcc.add(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
errorCodeFoundAcc.add(1);
|
||||
}
|
||||
parsedAuthorsAcc.add(1);
|
||||
workIdLastModifiedDate.forEach((k, v) -> {
|
||||
parsedWorksAcc.add(1);
|
||||
if (isModified(orcidId, v, lastUpdateValue)) {
|
||||
modifiedWorksAcc.add(1);
|
||||
workIds.add(orcidId.concat("/work/").concat(k));
|
||||
}
|
||||
});
|
||||
if (workIdLastModifiedDate.size() > 50) {
|
||||
maxModifiedWorksLimitAcc.add(1);
|
||||
}
|
||||
return workIds.iterator();
|
||||
};
|
||||
|
||||
Function<String, Tuple2<String, String>> downloadWorkFunction = data -> {
|
||||
String relativeWorkUrl = data;
|
||||
String orcidId = relativeWorkUrl.split("/")[0];
|
||||
final DownloadedRecordData downloaded = new DownloadedRecordData();
|
||||
downloaded.setOrcidId(orcidId);
|
||||
downloaded.setLastModifiedDate(lastUpdateValue);
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + relativeWorkUrl);
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
long startReq = System.currentTimeMillis();
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
long endReq = System.currentTimeMillis();
|
||||
long reqTime = endReq - startReq;
|
||||
if (reqTime < 1000) {
|
||||
Thread.sleep(1000 - reqTime);
|
||||
}
|
||||
int statusCode = response.getStatusLine().getStatusCode();
|
||||
downloaded.setStatusCode(statusCode);
|
||||
if (statusCode != 200) {
|
||||
switch (statusCode) {
|
||||
case 403:
|
||||
errorHTTP403Acc.add(1);
|
||||
case 404:
|
||||
errorHTTP404Acc.add(1);
|
||||
case 409:
|
||||
errorHTTP409Acc.add(1);
|
||||
case 503:
|
||||
errorHTTP503Acc.add(1);
|
||||
case 525:
|
||||
errorHTTP525Acc.add(1);
|
||||
default:
|
||||
errorHTTPGenericAcc.add(1);
|
||||
logger
|
||||
.info(
|
||||
"Downloading " + orcidId + " status code: "
|
||||
+ response.getStatusLine().getStatusCode());
|
||||
}
|
||||
return downloaded.toTuple2();
|
||||
}
|
||||
downloadedRecordsAcc.add(1);
|
||||
downloaded
|
||||
.setCompressedData(
|
||||
ArgumentApplicationParser
|
||||
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
||||
client.close();
|
||||
return downloaded.toTuple2();
|
||||
};
|
||||
|
||||
updatedAuthorsRDD
|
||||
.flatMap(retrieveWorkUrlFunction)
|
||||
.repartition(100)
|
||||
.map(downloadWorkFunction)
|
||||
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
|
||||
.saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);
|
||||
|
||||
logger.info("updatedAuthorsAcc: " + updatedAuthorsAcc.value().toString());
|
||||
logger.info("parsedAuthorsAcc: " + parsedAuthorsAcc.value().toString());
|
||||
logger.info("parsedWorksAcc: " + parsedWorksAcc.value().toString());
|
||||
logger.info("modifiedWorksAcc: " + modifiedWorksAcc.value().toString());
|
||||
logger.info("maxModifiedWorksLimitAcc: " + maxModifiedWorksLimitAcc.value().toString());
|
||||
logger.info("errorCodeFoundAcc: " + errorCodeFoundAcc.value().toString());
|
||||
logger.info("errorLoadingJsonFoundAcc: " + errorLoadingJsonFoundAcc.value().toString());
|
||||
logger.info("errorLoadingXMLFoundAcc: " + errorLoadingXMLFoundAcc.value().toString());
|
||||
logger.info("errorParsingXMLFoundAcc: " + errorParsingXMLFoundAcc.value().toString());
|
||||
logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
|
||||
logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
|
||||
logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
|
||||
logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
|
||||
logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
|
||||
logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString());
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
public static boolean isModified(String orcidId, String modifiedDateValue, String lastUpdateValue) {
|
||||
LocalDate modifiedDate = null;
|
||||
LocalDate lastUpdate = null;
|
||||
try {
|
||||
modifiedDate = LocalDate.parse(modifiedDateValue, SparkDownloadOrcidWorks.ORCID_XML_DATETIMEFORMATTER);
|
||||
if (lastUpdateValue.length() != 19) {
|
||||
lastUpdateValue = lastUpdateValue.substring(0, 19);
|
||||
}
|
||||
lastUpdate = LocalDate
|
||||
.parse(lastUpdateValue, SparkDownloadOrcidWorks.LAMBDA_FILE_DATE_FORMATTER);
|
||||
} catch (Exception e) {
|
||||
logger.info("[" + orcidId + "] Parsing date: ", e.getMessage());
|
||||
throw new RuntimeException("[" + orcidId + "] Parsing date: " + e.getMessage());
|
||||
}
|
||||
return modifiedDate.isAfter(lastUpdate);
|
||||
}
|
||||
|
||||
private static String getJsonValue(JsonElement jElement, String property) {
|
||||
if (jElement.getAsJsonObject().has(property)) {
|
||||
JsonElement name = null;
|
||||
name = jElement.getAsJsonObject().get(property);
|
||||
if (name != null && !name.isJsonNull()) {
|
||||
return name.getAsString();
|
||||
}
|
||||
}
|
||||
return new String("");
|
||||
}
|
||||
}
|
@ -0,0 +1,242 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkUpdateOrcidAuthors {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidAuthors.class);
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkUpdateOrcidAuthors.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
// final String outputPath = parser.get("outputPath");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
LongAccumulator oldAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("old_authors_found");
|
||||
LongAccumulator updatedAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("updated_authors_found");
|
||||
LongAccumulator newAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("new_authors_found");
|
||||
LongAccumulator errorCodeAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_code_authors_found");
|
||||
LongAccumulator errorLoadingAuthorsJsonFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_loading_authors_json_found");
|
||||
LongAccumulator errorParsingAuthorsXMLFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_parsing_authors_xml_found");
|
||||
|
||||
Function<Tuple2<Text, Text>, AuthorSummary> retrieveAuthorSummaryFunction = data -> {
|
||||
AuthorSummary authorSummary = new AuthorSummary();
|
||||
String orcidId = data._1().toString();
|
||||
String jsonData = data._2().toString();
|
||||
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||
String statusCode = getJsonValue(jElement, "statusCode");
|
||||
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||
if (statusCode.equals("200")) {
|
||||
String compressedData = getJsonValue(jElement, "compressedData");
|
||||
if (StringUtils.isEmpty(compressedData)) {
|
||||
errorLoadingAuthorsJsonFoundAcc.add(1);
|
||||
} else {
|
||||
String xmlAuthor = ArgumentApplicationParser.decompressValue(compressedData);
|
||||
try {
|
||||
authorSummary = XMLRecordParser
|
||||
.VTDParseAuthorSummary(xmlAuthor.getBytes());
|
||||
authorSummary.setStatusCode(statusCode);
|
||||
authorSummary.setDownloadDate(Long.toString(System.currentTimeMillis()));
|
||||
authorSummary.setBase64CompressData(compressedData);
|
||||
return authorSummary;
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing xml " + orcidId + " [" + jsonData + "]", e);
|
||||
errorParsingAuthorsXMLFoundAcc.add(1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
authorSummary.setStatusCode(statusCode);
|
||||
authorSummary.setDownloadDate(Long.toString(System.currentTimeMillis()));
|
||||
errorCodeAuthorsFoundAcc.add(1);
|
||||
}
|
||||
return authorSummary;
|
||||
};
|
||||
|
||||
Dataset<AuthorSummary> downloadedAuthorSummaryDS = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class)
|
||||
.map(retrieveAuthorSummaryFunction)
|
||||
.rdd(),
|
||||
Encoders.bean(AuthorSummary.class));
|
||||
Dataset<AuthorSummary> currentAuthorSummaryDS = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath.concat("orcid_dataset/authors/*"))
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
|
||||
.rdd(),
|
||||
Encoders.bean(AuthorSummary.class));
|
||||
Dataset<AuthorSummary> mergedAuthorSummaryDS = currentAuthorSummaryDS
|
||||
.joinWith(
|
||||
downloadedAuthorSummaryDS,
|
||||
currentAuthorSummaryDS
|
||||
.col("authorData.oid")
|
||||
.equalTo(downloadedAuthorSummaryDS.col("authorData.oid")),
|
||||
"full_outer")
|
||||
.map(value -> {
|
||||
Optional<AuthorSummary> opCurrent = Optional.ofNullable(value._1());
|
||||
Optional<AuthorSummary> opDownloaded = Optional.ofNullable(value._2());
|
||||
if (!opCurrent.isPresent()) {
|
||||
newAuthorsFoundAcc.add(1);
|
||||
return opDownloaded.get();
|
||||
}
|
||||
if (!opDownloaded.isPresent()) {
|
||||
oldAuthorsFoundAcc.add(1);
|
||||
return opCurrent.get();
|
||||
}
|
||||
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||
updatedAuthorsFoundAcc.add(1);
|
||||
return opDownloaded.get();
|
||||
}
|
||||
return null;
|
||||
},
|
||||
Encoders.bean(AuthorSummary.class))
|
||||
.filter(Objects::nonNull);
|
||||
|
||||
long mergedCount = mergedAuthorSummaryDS.count();
|
||||
|
||||
Dataset<AuthorSummary> base64DedupedDS = mergedAuthorSummaryDS.dropDuplicates("base64CompressData");
|
||||
|
||||
List<String> dupOids = base64DedupedDS
|
||||
.groupBy("authorData.oid")
|
||||
.agg(count("authorData.oid").alias("oidOccurrenceCount"))
|
||||
.where("oidOccurrenceCount > 1")
|
||||
.select("oid")
|
||||
.toJavaRDD()
|
||||
.map(row -> row.get(0).toString())
|
||||
.collect();
|
||||
|
||||
JavaRDD<AuthorSummary> dupAuthors = base64DedupedDS
|
||||
.toJavaRDD()
|
||||
.filter(
|
||||
authorSummary -> (Objects.nonNull(authorSummary.getAuthorData())
|
||||
&& Objects.nonNull(authorSummary.getAuthorData().getOid())))
|
||||
.filter(authorSummary -> dupOids.contains(authorSummary.getAuthorData().getOid()));
|
||||
|
||||
Dataset<AuthorSummary> dupAuthorSummaryDS = spark
|
||||
.createDataset(
|
||||
dupAuthors.rdd(),
|
||||
Encoders.bean(AuthorSummary.class));
|
||||
List<Tuple2<String, String>> lastModifiedAuthors = dupAuthorSummaryDS
|
||||
.groupBy("authorData.oid")
|
||||
.agg(array_max(collect_list("downloadDate")))
|
||||
.map(
|
||||
row -> new Tuple2<>(row.get(0).toString(), row.get(1).toString()),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||
.toJavaRDD()
|
||||
.collect();
|
||||
|
||||
JavaRDD<AuthorSummary> lastDownloadedAuthors = base64DedupedDS
|
||||
.toJavaRDD()
|
||||
.filter(
|
||||
authorSummary -> (Objects.nonNull(authorSummary.getAuthorData())
|
||||
&& Objects.nonNull(authorSummary.getAuthorData().getOid())))
|
||||
.filter(authorSummary -> {
|
||||
boolean oidFound = lastModifiedAuthors
|
||||
.stream()
|
||||
.filter(a -> a._1().equals(authorSummary.getAuthorData().getOid()))
|
||||
.count() == 1;
|
||||
boolean tsFound = lastModifiedAuthors
|
||||
.stream()
|
||||
.filter(
|
||||
a -> a._1().equals(authorSummary.getAuthorData().getOid()) &&
|
||||
a._2().equals(authorSummary.getDownloadDate()))
|
||||
.count() == 1;
|
||||
return (oidFound && tsFound) || (!oidFound);
|
||||
});
|
||||
|
||||
Dataset<AuthorSummary> cleanedDS = spark
|
||||
.createDataset(
|
||||
lastDownloadedAuthors.rdd(),
|
||||
Encoders.bean(AuthorSummary.class))
|
||||
.dropDuplicates("downloadDate", "authorData");
|
||||
cleanedDS
|
||||
.toJavaRDD()
|
||||
.map(authorSummary -> OBJECT_MAPPER.writeValueAsString(authorSummary))
|
||||
.saveAsTextFile(workingPath.concat("orcid_dataset/new_authors"), GzipCodec.class);
|
||||
long cleanedDSCount = cleanedDS.count();
|
||||
|
||||
logger.info("report_oldAuthorsFoundAcc: " + oldAuthorsFoundAcc.value().toString());
|
||||
logger.info("report_newAuthorsFoundAcc: " + newAuthorsFoundAcc.value().toString());
|
||||
logger.info("report_updatedAuthorsFoundAcc: " + updatedAuthorsFoundAcc.value().toString());
|
||||
logger.info("report_errorCodeFoundAcc: " + errorCodeAuthorsFoundAcc.value().toString());
|
||||
logger.info("report_errorLoadingJsonFoundAcc: " + errorLoadingAuthorsJsonFoundAcc.value().toString());
|
||||
logger.info("report_errorParsingXMLFoundAcc: " + errorParsingAuthorsXMLFoundAcc.value().toString());
|
||||
logger.info("report_merged_count: " + mergedCount);
|
||||
logger.info("report_cleaned_count: " + cleanedDSCount);
|
||||
});
|
||||
}
|
||||
|
||||
private static String getJsonValue(JsonElement jElement, String property) {
|
||||
if (jElement.getAsJsonObject().has(property)) {
|
||||
JsonElement name = null;
|
||||
name = jElement.getAsJsonObject().get(property);
|
||||
if (name != null && !name.isJsonNull()) {
|
||||
return name.getAsString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
@ -0,0 +1,317 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
|
||||
import eu.dnetlib.dhp.schema.orcid.Work;
|
||||
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkUpdateOrcidDatasets {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidDatasets.class);
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkUpdateOrcidDatasets.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
// final String outputPath = parser.get("outputPath");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
LongAccumulator oldAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("old_authors_found");
|
||||
LongAccumulator updatedAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("updated_authors_found");
|
||||
LongAccumulator newAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("new_authors_found");
|
||||
LongAccumulator errorCodeAuthorsFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_code_authors_found");
|
||||
LongAccumulator errorLoadingAuthorsJsonFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_loading_authors_json_found");
|
||||
LongAccumulator errorParsingAuthorsXMLFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_parsing_authors_xml_found");
|
||||
|
||||
LongAccumulator oldWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("old_works_found");
|
||||
LongAccumulator updatedWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("updated_works_found");
|
||||
LongAccumulator newWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("new_works_found");
|
||||
LongAccumulator errorCodeWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_code_works_found");
|
||||
LongAccumulator errorLoadingWorksJsonFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_loading_works_json_found");
|
||||
LongAccumulator errorParsingWorksXMLFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_parsing_works_xml_found");
|
||||
|
||||
// JavaPairRDD<Text, Text> xmlSummariesRDD = sc
|
||||
// .sequenceFile(workingPath.concat("xml/authors/xml_authors.seq"), Text.class, Text.class);
|
||||
// xmlSummariesRDD
|
||||
// .map(seq -> {
|
||||
// AuthorSummary authorSummary = XMLRecordParser
|
||||
// .VTDParseAuthorSummary(seq._2().toString().getBytes());
|
||||
// authorSummary
|
||||
// .setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString()));
|
||||
// return authorSummary;
|
||||
// })
|
||||
// .filter(authorSummary -> authorSummary != null)
|
||||
// .map(authorSummary -> JsonWriter.create(authorSummary))
|
||||
// .saveAsTextFile(workingPath.concat("orcid_dataset/authors"), GzipCodec.class);
|
||||
//
|
||||
// JavaPairRDD<Text, Text> xmlWorksRDD = sc
|
||||
// .sequenceFile(workingPath.concat("xml/works/*"), Text.class, Text.class);
|
||||
//
|
||||
// xmlWorksRDD
|
||||
// .map(seq -> {
|
||||
// WorkDetail workDetail = XMLRecordParserNoDoi.VTDParseWorkData(seq._2().toString().getBytes());
|
||||
// Work work = new Work();
|
||||
// work.setWorkDetail(workDetail);
|
||||
// work.setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString()));
|
||||
// return work;
|
||||
// })
|
||||
// .filter(work -> work != null)
|
||||
// .map(work -> JsonWriter.create(work))
|
||||
// .saveAsTextFile(workingPath.concat("orcid_dataset/works"), GzipCodec.class);
|
||||
|
||||
// Function<Tuple2<Text, Text>, AuthorSummary> retrieveAuthorSummaryFunction = data -> {
|
||||
// AuthorSummary authorSummary = new AuthorSummary();
|
||||
// String orcidId = data._1().toString();
|
||||
// String jsonData = data._2().toString();
|
||||
// JsonElement jElement = new JsonParser().parse(jsonData);
|
||||
// String statusCode = getJsonValue(jElement, "statusCode");
|
||||
// String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||
// if (statusCode.equals("200")) {
|
||||
// String compressedData = getJsonValue(jElement, "compressedData");
|
||||
// if (StringUtils.isEmpty(compressedData)) {
|
||||
// errorLoadingAuthorsJsonFoundAcc.add(1);
|
||||
// } else {
|
||||
// String xmlAuthor = ArgumentApplicationParser.decompressValue(compressedData);
|
||||
// try {
|
||||
// authorSummary = XMLRecordParser
|
||||
// .VTDParseAuthorSummary(xmlAuthor.getBytes());
|
||||
// authorSummary.setStatusCode(statusCode);
|
||||
// authorSummary.setDownloadDate("2020-11-18 00:00:05.644768");
|
||||
// authorSummary.setBase64CompressData(compressedData);
|
||||
// return authorSummary;
|
||||
// } catch (Exception e) {
|
||||
// logger.error("parsing xml " + orcidId + " [" + jsonData + "]", e);
|
||||
// errorParsingAuthorsXMLFoundAcc.add(1);
|
||||
// }
|
||||
// }
|
||||
// } else {
|
||||
// authorSummary.setStatusCode(statusCode);
|
||||
// authorSummary.setDownloadDate("2020-11-18 00:00:05.644768");
|
||||
// errorCodeAuthorsFoundAcc.add(1);
|
||||
// }
|
||||
// return authorSummary;
|
||||
// };
|
||||
//
|
||||
// Dataset<AuthorSummary> downloadedAuthorSummaryDS = spark
|
||||
// .createDataset(
|
||||
// sc
|
||||
// .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class)
|
||||
// .map(retrieveAuthorSummaryFunction)
|
||||
// .rdd(),
|
||||
// Encoders.bean(AuthorSummary.class));
|
||||
// Dataset<AuthorSummary> currentAuthorSummaryDS = spark
|
||||
// .createDataset(
|
||||
// sc
|
||||
// .textFile(workingPath.concat("orcid_dataset/authors/*"))
|
||||
// .map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class))
|
||||
// .rdd(),
|
||||
// Encoders.bean(AuthorSummary.class));
|
||||
// currentAuthorSummaryDS
|
||||
// .joinWith(
|
||||
// downloadedAuthorSummaryDS,
|
||||
// currentAuthorSummaryDS
|
||||
// .col("authorData.oid")
|
||||
// .equalTo(downloadedAuthorSummaryDS.col("authorData.oid")),
|
||||
// "full_outer")
|
||||
// .map(value -> {
|
||||
// Optional<AuthorSummary> opCurrent = Optional.ofNullable(value._1());
|
||||
// Optional<AuthorSummary> opDownloaded = Optional.ofNullable(value._2());
|
||||
// if (!opCurrent.isPresent()) {
|
||||
// newAuthorsFoundAcc.add(1);
|
||||
// return opDownloaded.get();
|
||||
// }
|
||||
// if (!opDownloaded.isPresent()) {
|
||||
// oldAuthorsFoundAcc.add(1);
|
||||
// return opCurrent.get();
|
||||
// }
|
||||
// if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||
// updatedAuthorsFoundAcc.add(1);
|
||||
// return opDownloaded.get();
|
||||
// }
|
||||
// return null;
|
||||
// },
|
||||
// Encoders.bean(AuthorSummary.class))
|
||||
// .filter(Objects::nonNull)
|
||||
// .toJavaRDD()
|
||||
// .map(authorSummary -> OBJECT_MAPPER.writeValueAsString(authorSummary))
|
||||
// .saveAsTextFile(workingPath.concat("orcid_dataset/new_authors"), GzipCodec.class);
|
||||
//
|
||||
// logger.info("oldAuthorsFoundAcc: " + oldAuthorsFoundAcc.value().toString());
|
||||
// logger.info("newAuthorsFoundAcc: " + newAuthorsFoundAcc.value().toString());
|
||||
// logger.info("updatedAuthorsFoundAcc: " + updatedAuthorsFoundAcc.value().toString());
|
||||
// logger.info("errorCodeFoundAcc: " + errorCodeAuthorsFoundAcc.value().toString());
|
||||
// logger.info("errorLoadingJsonFoundAcc: " + errorLoadingAuthorsJsonFoundAcc.value().toString());
|
||||
// logger.info("errorParsingXMLFoundAcc: " + errorParsingAuthorsXMLFoundAcc.value().toString());
|
||||
|
||||
Function<String, Work> retrieveWorkFunction = jsonData -> {
|
||||
Work work = new Work();
|
||||
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||
String statusCode = getJsonValue(jElement, "statusCode");
|
||||
work.setStatusCode(statusCode);
|
||||
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||
work.setDownloadDate("2020-11-18 00:00:05.644768");
|
||||
if (statusCode.equals("200")) {
|
||||
String compressedData = getJsonValue(jElement, "compressedData");
|
||||
if (StringUtils.isEmpty(compressedData)) {
|
||||
errorLoadingWorksJsonFoundAcc.add(1);
|
||||
} else {
|
||||
String xmlWork = ArgumentApplicationParser.decompressValue(compressedData);
|
||||
try {
|
||||
WorkDetail workDetail = XMLRecordParserNoDoi
|
||||
.VTDParseWorkData(xmlWork.getBytes());
|
||||
work.setWorkDetail(workDetail);
|
||||
work.setBase64CompressData(compressedData);
|
||||
return work;
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing xml [" + jsonData + "]", e);
|
||||
errorParsingWorksXMLFoundAcc.add(1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
errorCodeWorksFoundAcc.add(1);
|
||||
}
|
||||
return work;
|
||||
};
|
||||
|
||||
Dataset<Work> downloadedWorksDS = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath + "downloads/updated_works/*")
|
||||
.map(s -> {
|
||||
return s.substring(21, s.length() - 1);
|
||||
})
|
||||
.map(retrieveWorkFunction)
|
||||
.rdd(),
|
||||
Encoders.bean(Work.class));
|
||||
Dataset<Work> currentWorksDS = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath.concat("orcid_dataset/works/*"))
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
|
||||
.rdd(),
|
||||
Encoders.bean(Work.class));
|
||||
currentWorksDS
|
||||
.joinWith(
|
||||
downloadedWorksDS,
|
||||
currentWorksDS
|
||||
.col("workDetail.id")
|
||||
.equalTo(downloadedWorksDS.col("workDetail.id"))
|
||||
.and(
|
||||
currentWorksDS
|
||||
.col("workDetail.oid")
|
||||
.equalTo(downloadedWorksDS.col("workDetail.oid"))),
|
||||
"full_outer")
|
||||
.map(value -> {
|
||||
Optional<Work> opCurrent = Optional.ofNullable(value._1());
|
||||
Optional<Work> opDownloaded = Optional.ofNullable(value._2());
|
||||
if (!opCurrent.isPresent()) {
|
||||
newWorksFoundAcc.add(1);
|
||||
return opDownloaded.get();
|
||||
}
|
||||
if (!opDownloaded.isPresent()) {
|
||||
oldWorksFoundAcc.add(1);
|
||||
return opCurrent.get();
|
||||
}
|
||||
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||
updatedWorksFoundAcc.add(1);
|
||||
return opDownloaded.get();
|
||||
}
|
||||
return null;
|
||||
},
|
||||
Encoders.bean(Work.class))
|
||||
.filter(Objects::nonNull)
|
||||
.toJavaRDD()
|
||||
.map(work -> OBJECT_MAPPER.writeValueAsString(work))
|
||||
.saveAsTextFile(workingPath.concat("orcid_dataset/new_works"), GzipCodec.class);
|
||||
|
||||
logger.info("oldWorksFoundAcc: " + oldWorksFoundAcc.value().toString());
|
||||
logger.info("newWorksFoundAcc: " + newWorksFoundAcc.value().toString());
|
||||
logger.info("updatedWorksFoundAcc: " + updatedWorksFoundAcc.value().toString());
|
||||
logger.info("errorCodeWorksFoundAcc: " + errorCodeWorksFoundAcc.value().toString());
|
||||
logger.info("errorLoadingJsonWorksFoundAcc: " + errorLoadingWorksJsonFoundAcc.value().toString());
|
||||
logger.info("errorParsingXMLWorksFoundAcc: " + errorParsingWorksXMLFoundAcc.value().toString());
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private static String getJsonValue(JsonElement jElement, String property) {
|
||||
if (jElement.getAsJsonObject().has(property)) {
|
||||
JsonElement name = null;
|
||||
name = jElement.getAsJsonObject().get(property);
|
||||
if (name != null && !name.isJsonNull()) {
|
||||
return name.getAsString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
@ -0,0 +1,186 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.orcid.Work;
|
||||
import eu.dnetlib.dhp.schema.orcid.WorkDetail;
|
||||
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||
|
||||
public class SparkUpdateOrcidWorks {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.setSerializationInclusion(JsonInclude.Include.NON_NULL);
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidWorks.class);
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkUpdateOrcidWorks.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
final String hdfsServerUri = parser.get("hdfsServerUri");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
LongAccumulator oldWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("old_works_found");
|
||||
LongAccumulator updatedWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("updated_works_found");
|
||||
LongAccumulator newWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("new_works_found");
|
||||
LongAccumulator errorCodeWorksFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_code_works_found");
|
||||
LongAccumulator errorLoadingWorksJsonFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_loading_works_json_found");
|
||||
LongAccumulator errorParsingWorksXMLFoundAcc = spark
|
||||
.sparkContext()
|
||||
.longAccumulator("error_parsing_works_xml_found");
|
||||
|
||||
Function<String, Work> retrieveWorkFunction = jsonData -> {
|
||||
Work work = new Work();
|
||||
JsonElement jElement = new JsonParser().parse(jsonData);
|
||||
String statusCode = getJsonValue(jElement, "statusCode");
|
||||
work.setStatusCode(statusCode);
|
||||
String downloadDate = getJsonValue(jElement, "lastModifiedDate");
|
||||
work.setDownloadDate(Long.toString(System.currentTimeMillis()));
|
||||
if (statusCode.equals("200")) {
|
||||
String compressedData = getJsonValue(jElement, "compressedData");
|
||||
if (StringUtils.isEmpty(compressedData)) {
|
||||
errorLoadingWorksJsonFoundAcc.add(1);
|
||||
} else {
|
||||
String xmlWork = ArgumentApplicationParser.decompressValue(compressedData);
|
||||
try {
|
||||
WorkDetail workDetail = XMLRecordParserNoDoi
|
||||
.VTDParseWorkData(xmlWork.getBytes());
|
||||
work.setWorkDetail(workDetail);
|
||||
work.setBase64CompressData(compressedData);
|
||||
return work;
|
||||
} catch (Exception e) {
|
||||
logger.error("parsing xml [" + jsonData + "]", e);
|
||||
errorParsingWorksXMLFoundAcc.add(1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
errorCodeWorksFoundAcc.add(1);
|
||||
}
|
||||
return work;
|
||||
};
|
||||
|
||||
Dataset<Work> downloadedWorksDS = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath + "downloads/updated_works/*")
|
||||
.map(s -> {
|
||||
return s.substring(21, s.length() - 1);
|
||||
})
|
||||
.map(retrieveWorkFunction)
|
||||
.rdd(),
|
||||
Encoders.bean(Work.class));
|
||||
Dataset<Work> currentWorksDS = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(workingPath.concat("orcid_dataset/works/*"))
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Work.class))
|
||||
.rdd(),
|
||||
Encoders.bean(Work.class));
|
||||
currentWorksDS
|
||||
.joinWith(
|
||||
downloadedWorksDS,
|
||||
currentWorksDS
|
||||
.col("workDetail.id")
|
||||
.equalTo(downloadedWorksDS.col("workDetail.id"))
|
||||
.and(
|
||||
currentWorksDS
|
||||
.col("workDetail.oid")
|
||||
.equalTo(downloadedWorksDS.col("workDetail.oid"))),
|
||||
"full_outer")
|
||||
.map(value -> {
|
||||
Optional<Work> opCurrent = Optional.ofNullable(value._1());
|
||||
Optional<Work> opDownloaded = Optional.ofNullable(value._2());
|
||||
if (!opCurrent.isPresent()) {
|
||||
newWorksFoundAcc.add(1);
|
||||
return opDownloaded.get();
|
||||
}
|
||||
if (!opDownloaded.isPresent()) {
|
||||
oldWorksFoundAcc.add(1);
|
||||
return opCurrent.get();
|
||||
}
|
||||
if (opCurrent.isPresent() && opDownloaded.isPresent()) {
|
||||
updatedWorksFoundAcc.add(1);
|
||||
return opDownloaded.get();
|
||||
}
|
||||
return null;
|
||||
},
|
||||
Encoders.bean(Work.class))
|
||||
.filter(Objects::nonNull)
|
||||
.toJavaRDD()
|
||||
.map(work -> OBJECT_MAPPER.writeValueAsString(work))
|
||||
.saveAsTextFile(workingPath.concat("orcid_dataset/new_works"), GzipCodec.class);
|
||||
|
||||
logger.info("oldWorksFoundAcc: " + oldWorksFoundAcc.value().toString());
|
||||
logger.info("newWorksFoundAcc: " + newWorksFoundAcc.value().toString());
|
||||
logger.info("updatedWorksFoundAcc: " + updatedWorksFoundAcc.value().toString());
|
||||
logger.info("errorCodeWorksFoundAcc: " + errorCodeWorksFoundAcc.value().toString());
|
||||
logger.info("errorLoadingJsonWorksFoundAcc: " + errorLoadingWorksJsonFoundAcc.value().toString());
|
||||
logger.info("errorParsingXMLWorksFoundAcc: " + errorParsingWorksXMLFoundAcc.value().toString());
|
||||
|
||||
String lastModifiedDateFromLambdaFile = HDFSUtil
|
||||
.readFromTextFile(hdfsServerUri, workingPath, "last_modified_date_from_lambda_file.txt");
|
||||
HDFSUtil.writeToTextFile(hdfsServerUri, workingPath, "last_update.txt", lastModifiedDateFromLambdaFile);
|
||||
logger.info("last_update file updated");
|
||||
});
|
||||
}
|
||||
|
||||
private static String getJsonValue(JsonElement jElement, String property) {
|
||||
if (jElement.getAsJsonObject().has(property)) {
|
||||
JsonElement name = null;
|
||||
name = jElement.getAsJsonObject().get(property);
|
||||
if (name != null && !name.isJsonNull()) {
|
||||
return name.getAsString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
@ -0,0 +1,67 @@
|
||||
|
||||
package eu.dnetlib.doiboost.orcid.util;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URI;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors;
|
||||
|
||||
public class HDFSUtil {
|
||||
|
||||
static Logger logger = LoggerFactory.getLogger(HDFSUtil.class);
|
||||
|
||||
private static FileSystem getFileSystem(String hdfsServerUri) throws IOException {
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsServerUri);
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
return fileSystem;
|
||||
}
|
||||
|
||||
public static String readFromTextFile(String hdfsServerUri, String workingPath, String path) throws IOException {
|
||||
FileSystem fileSystem = getFileSystem(hdfsServerUri);
|
||||
Path toReadPath = new Path(workingPath.concat(path));
|
||||
if (!fileSystem.exists(toReadPath)) {
|
||||
throw new RuntimeException("File not exist: " + path);
|
||||
}
|
||||
logger.info("Last_update_path " + toReadPath.toString());
|
||||
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(toReadPath));
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
|
||||
StringBuffer sb = new StringBuffer();
|
||||
try {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
sb.append(line);
|
||||
}
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
String buffer = sb.toString();
|
||||
logger.info("Last_update: " + buffer);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
public static void writeToTextFile(String hdfsServerUri, String workingPath, String path, String text)
|
||||
throws IOException {
|
||||
FileSystem fileSystem = getFileSystem(hdfsServerUri);
|
||||
Path toWritePath = new Path(workingPath.concat(path));
|
||||
if (fileSystem.exists(toWritePath)) {
|
||||
fileSystem.delete(toWritePath, true);
|
||||
}
|
||||
FSDataOutputStream os = fileSystem.create(toWritePath);
|
||||
BufferedWriter br = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));
|
||||
br.write(text);
|
||||
br.close();
|
||||
}
|
||||
}
|
@ -0,0 +1,77 @@
|
||||
diff a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java (rejected hunks)
|
||||
@@ -30,11 +30,11 @@ public class PublicationToOaf implements Serializable {
|
||||
|
||||
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
||||
|
||||
- public static final String ORCID = "ORCID";
|
||||
- public static final String ORCID_PID_TYPE_CLASSNAME = "Open Researcher and Contributor ID";
|
||||
public final static String orcidPREFIX = "orcid_______";
|
||||
public static final String OPENAIRE_PREFIX = "openaire____";
|
||||
public static final String SEPARATOR = "::";
|
||||
+ public static final String DEACTIVATED_NAME = "Given Names Deactivated";
|
||||
+ public static final String DEACTIVATED_SURNAME = "Family Name Deactivated";
|
||||
|
||||
private String dateOfCollection = "";
|
||||
private final LongAccumulator parsedPublications;
|
||||
@@ -72,13 +81,18 @@ public class PublicationToOaf implements Serializable {
|
||||
this.errorsNotFoundAuthors = null;
|
||||
this.errorsInvalidType = null;
|
||||
this.otherTypeFound = null;
|
||||
+ this.deactivatedAcc = null;
|
||||
+ this.titleNotProvidedAcc = null;
|
||||
+ this.noUrlAcc = null;
|
||||
this.dateOfCollection = null;
|
||||
}
|
||||
|
||||
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
||||
|
||||
{
|
||||
- put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||
+ put(
|
||||
+ ModelConstants.ORCID,
|
||||
+ new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||
|
||||
}
|
||||
};
|
||||
@@ -183,6 +197,12 @@ public class PublicationToOaf implements Serializable {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
+ if (titles.stream().filter(t -> (t != null && t.equals("Title Not Supplied"))).count() > 0) {
|
||||
+ if (titleNotProvidedAcc != null) {
|
||||
+ titleNotProvidedAcc.add(1);
|
||||
+ }
|
||||
+ return null;
|
||||
+ }
|
||||
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||
publication
|
||||
.setTitle(
|
||||
@@ -527,24 +562,21 @@ public class PublicationToOaf implements Serializable {
|
||||
|
||||
private KeyValue createCollectedFrom() {
|
||||
KeyValue cf = new KeyValue();
|
||||
- cf.setValue(ORCID);
|
||||
+ cf.setValue(ModelConstants.ORCID.toUpperCase());
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
||||
return cf;
|
||||
}
|
||||
|
||||
private KeyValue createHostedBy() {
|
||||
- KeyValue hb = new KeyValue();
|
||||
- hb.setValue("Unknown Repository");
|
||||
- hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
|
||||
- return hb;
|
||||
+ return ModelConstants.UNKNOWN_REPOSITORY;
|
||||
}
|
||||
|
||||
private StructuredProperty mapAuthorId(String orcidId) {
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(orcidId);
|
||||
final Qualifier q = new Qualifier();
|
||||
- q.setClassid(ORCID.toLowerCase());
|
||||
- q.setClassname(ORCID_PID_TYPE_CLASSNAME);
|
||||
+ q.setClassid(ModelConstants.ORCID);
|
||||
+ q.setClassname(ModelConstants.ORCID_CLASSNAME);
|
||||
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||
sp.setQualifier(q);
|
@ -1,3 +1,5 @@
|
||||
[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true},
|
||||
{"paramName":"a", "paramLongName":"authorsPath", "paramDescription": "the path of the authors seq file", "paramRequired": true},
|
||||
{"paramName":"xw", "paramLongName":"xmlWorksPath", "paramDescription": "the path of the works xml seq file", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputDoiAuthorListPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
||||
]
|
@ -1,7 +1,6 @@
|
||||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"orcidDataFolder", "paramDescription": "the folder of orcid data", "paramRequired": true},
|
||||
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
||||
]
|
@ -1,18 +0,0 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>queueName</name>
|
||||
<value>default</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
@ -1,55 +1,99 @@
|
||||
<workflow-app name="Gen_Doi_Author_List_WF" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<workflow-app name="gen_doi_author_list" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2MaxExecutors</name>
|
||||
<value>20</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath_activities}/doi_author_list'/>
|
||||
</fs>
|
||||
<ok to="Gen_Doi_Author_List"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Gen_Doi_Author_List">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Gen_Doi_Author_List</name>
|
||||
<class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
|
||||
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
|
||||
<spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
|
||||
</spark-opts>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-o</arg><arg>doi_author_list/</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/doi_author_list'/>
|
||||
</fs>
|
||||
<ok to="GenDoiAuthorList"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenDoiAuthorList">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenDoiAuthorList</name>
|
||||
<class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||
</spark-opts>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-a</arg><arg>authors/authors.seq</arg>
|
||||
<arg>-xw</arg><arg>xml/works/*.seq</arg>
|
||||
<arg>-o</arg><arg>doi_author_list/</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,163 @@
|
||||
<workflow-app name="update_orcid_datasets" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>spark2MaxExecutors</name>
|
||||
<value>50</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<!-- NOTE(review): execution starts at promoteOrcidAuthorsDataset and proceeds to
     promoteOrcidWorksDataset, CleanWorkingPath and End. No transition in this workflow
     targets ResetWorkingPath, so ResetWorkingPath, UpdateOrcidAuthors and UpdateOrcidWorks
     are not on the executed path as written; confirm this is intentional. -->
<start to="promoteOrcidAuthorsDataset"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/orcid_dataset/new_authors'/>
|
||||
<delete path='${workingPath}/orcid_dataset/new_works'/>
|
||||
</fs>
|
||||
<ok to="UpdateOrcidAuthors"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="UpdateOrcidAuthors">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>UpdateOrcidAuthors</name>
|
||||
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidAuthors</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
</spark-opts>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>-</arg>
|
||||
<arg>-o</arg><arg>-</arg>
|
||||
<arg>-t</arg><arg>-</arg>
|
||||
</spark>
|
||||
<ok to="UpdateOrcidWorks"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="UpdateOrcidWorks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>UpdateOrcidWorks</name>
|
||||
<class>eu.dnetlib.doiboost.orcid.SparkUpdateOrcidWorks</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
</spark-opts>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>-</arg>
|
||||
<arg>-o</arg><arg>-</arg>
|
||||
<arg>-t</arg><arg>-</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="promoteOrcidAuthorsDataset">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${workingPath}/orcid_dataset/authors"/>
|
||||
<mkdir path="${workingPath}/orcid_dataset/authors"/>
|
||||
</prepare>
|
||||
<arg>${workingPath}/orcid_dataset/new_authors/*</arg>
|
||||
<arg>${workingPath}/orcid_dataset/authors</arg>
|
||||
</distcp>
|
||||
<ok to="promoteOrcidWorksDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="promoteOrcidWorksDataset">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${workingPath}/orcid_dataset/works"/>
|
||||
<mkdir path="${workingPath}/orcid_dataset/works"/>
|
||||
</prepare>
|
||||
<arg>${workingPath}/orcid_dataset/new_works/*</arg>
|
||||
<arg>${workingPath}/orcid_dataset/works</arg>
|
||||
</distcp>
|
||||
<ok to="CleanWorkingPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CleanWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}/orcid_dataset/new_authors'/>
|
||||
<delete path='${workingPath}/orcid_dataset/new_works'/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,22 +0,0 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.java</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx4g</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1 @@
|
||||
H4sIAAAAAAAAAO1c63LbNhb+n6fA6EebTE2JulpyYnXVpE2a1Jus7V5mO/0BkZCImCJVgLSidjqzf/cJ9oH2TfZJ9jsASVESLWdsddNulJlcDJxzcO4XEJMnn7+bhexaKC3j6LTWrLs1JiIv9mU0Pa19e/mV068xnfDI52EcidPaUuja58MHTxaxujqhP9g8TRwgYK/Xb/Z7TbdZY3OeBKe1hotfDn63nF6v13GOO91mg3AaK8hrqeVYhjJZntbm6TiUXo2BpUifyCgRKuLhaS1IkvlJo7FYLOqx8qSPP6eNSDdyiBxD+KnHEyPITSgFSI7jS53IyNuNVQIq8MRcCZAS/g60AibHipNAKCfiM3Ez1gomx5qJ2RgWCuT8ZqwVTKENpWK1QxO0ncN68Wy2SwF2P4eGULHaIbfdz6HnYCuGlRxfJFyG+ma8TcicwpVYLnYemAEUks+AvUNy2i5g31kfcqQvokROpNils23gnM4kjWzM3ISbARRaUWIiFEJN7FLICijH476vhN6BkwGsouhawgGdeazlbiffhMwpUMDejEW7OWSAMInV8mbgDGBlp3kYL2dQ5S5j5TA51s8pD6H62yJ9DSzH1UJdS29H8GUA6757m8cWtkGGgA7lLpOuYFbRpAVXHgV9qna47TrcikP8rMS1FItdbBZAOd44DXdYlXY3+QMBHadql/a2QGvDBwy/ntj8ceIpQdnQ8fHnsOW2UByaTtu9bLVOOv2TJqpPx/37k0YV9BqdkOvEmaFIIQLL1Jqu02pdus0T1z1xe/VOu7+iVoGzRtMybNe21x0vlPBBBP4KogyVKjkkrWioZaUSi9QYvXnjdH948bfLL1vtN98evx5dXA4KvgizkiTV0OFOVANRiRvEOhkWfBQIZnklYeNWETeUQEVp+ApZ7FPNnsZhKKaCfRNHfhxt0jKQDypOyRZN+5DIJKzQuF2+iD3JQ/aF4jJiX6W2+mLhjCepMkHNsPFXsRjHKmJfRxMeJZp9L5OAoVsx/4jThHH2FZ/JcMle2NzD4gkbpYnUM3YxF16i0hl7JjWqh1AFqyXGnjQ2WbW8v4U0VAnsxsvR2Qi8JKYhiuciytDWoUroOohVgjqnPSXnJMzwkzB5PP9kmjz+ejbHHkfSP2HfBzxhUkNShD1lZxYrxr2fU6nwb8gfiVSh97oWYTynJAkFeTCISeCa6dSDNjTjVmCdC+xnArOHo4tnj+iAKCZVTeQ7OiJNoAdxxMbQn4x0IrhPMJxdp2EkFLf9GktiLBU0odcEtkr0ERO0CONB69paEVGHVJyGlPfq7GtbPZdwJIZmh41lHMZTpOqQzYQX8AjM4jhtkEnoBVl1/XAljBI0C+P4ighBTOQeHAmtIPELWkApQ3cZkihiEithTzMeBXl0wOcgPl4SXBLxZOP8yEcoGxTxDolemjpMcobI4DjRcIVtLTLJ62wUyRmo6CT1ISn0P50KnQAIZtSp9gRsvdJehfFyy+B4JTVILAIRsamIRCK9nCWBSq3iKEMB3JVmE8sqeCnZn4foV6gZp7bFsK6XkRcAN051poisIBm9kawkqdUF/Sv2rRskKN0sgEojsKugTnAl3iGyIuuHQTrj5I0I0QQmJmduGG8u3Pr1+K2go+DVlzEZF00KSUfdrmU0slENLiercJ+twp3Yt+5kOfek8lKo3fjmhrPAl23YB6Wwv3hmQ8akjEomnwktp9ERuxAJGv7pkUklb7iC8uWcEswJMo1VhhdTCBtTG+rtXiF+xkJkebFZqJKdoxUKukOhFrAoJJ5aa1MRjSgPMDjV1Ph4wi4SdhnEM1jiRaznkuwEmWwSPmJfRtMQ5x6xVBt45gtfmgkkO6lQXk5SLxHfMxg0WZBNX6aRYK32EWu5za4Vf5ROU/hw06z160hza1IiaShNqWyqhADPIScj203S+MPzzx4ZOmRoG4V5JIfC5BBKTiSvDSIDu6bJSgU+PHcesQUo4khPpSY3ZjFgbVJnFyVf
p1CD7GVnt3pQYmpCJZTRFUiAn8zHch9kC07Gns05Um6Vz5wRmdc2Z1ruzwTXKax3ws4z6vhhjr8pFxkut84gQbQIESG5Bxetv82zZjbWAXZnGI4cjthYaqlzzbKQ0shmhBfiEkVwKbgXZBIbsVINelQfQNSwbLJb7JVYswUlEiXF8YwEtuCJMSUn2slZqrPnKk7nJudnw8sR0UgUOgZyOaMA8Q7ehfYBLj2WKgmKn7THI+t4U0Pm3/8yO2bW54YlkDP6yvNPlVHOhUa1gQUuoZuJJF7R8qFciYR4AZummE5Ys8/OPwN12z48bLYRf6F4DIX4EhntR8WjqfjJVAjkW41SR25UZrXTqg/a7MeOW3ddp9Op93s/gT9xpa3b0wHOfQ/ouuzH9qDeGtAB3X5+QDkYg9hqBdIEqNeUx8z4EyUmaqaUZo2TbNWBzQqgAJwYhqgAKLiClrDZjD1M/vOPf57id6ve6T9mb7Kf0LVbUUMxAR4Kl7B9CKVNsFagteuD3jpandIpJlZTr45sijCeycsC3OgJuV8T1zzK2NViSpXRNCQmMCami0lDXubEbVcI4ME9AZeIEvNWGzn1E1Yi4ZZJgJ45ahuyVe83NyA3VFyGPT6uoloJ2u2ugVptrrz56DZ7+4JGLMoBMRX19oBSTadrnevTbZc8onpNGNXkstNklFOFZUqub84w6RmzQdZcVIXu0zjywlTbBgZGOUdavLbt8EWl1+q8GfSZj2kKGWa9aVilMkRClsxMQTTtOvLVJdVzW8gncWoSKrXdRatguxvoM+DXtqzeUvOMB290JFshuDvPkuT+Uq9LYlx/JYG6obrMVQzXNR2APdWx3X5WdWAQRLMhWtJ/NrFsDyalqcVDv7Fa2153kuVcDMdynIh3Gb31rZvwrnmYiuFfTKMVil87/nG33ez1B72+3/EHYtxqdwb+2D9u9pu+N3aPQMeMVIbWKat9gGGxRkzwMaIDnmiYOAxuh8Htzz64/fGmtMNIdhjJdo5kh/nrQ89fh2HrMGwdhq0//rB1mKz+h5OVnQ9S1EqVDSkv0Vsm7KnkSqF6c8PIS8ooaFzZ60/PoGgvQCuccJC2BuIhYhIjx0wie19blGd8gj6XfUGdQyjM0jeph940Zk8NN7HzHHnOt1ujCBxES/ZGIcLMypczMPwiBffWCy4SIaOFQGf168sYrERYfxXyVP+WcUhrnL1C6uQ6o0Bl/41QympztBRoydlLfk3lDAvfhdwHz4qDeIwKFIiM93MevYUORldxKK64sudTqQ7Yd9JLYpUdqcU8YC/4WzKekVl4aKLYWarmwTLTwrUEJ/6CK99ydYlaeCXZCIIG0qw8p3YCzdOZNwqpbTMmWULDLJ8b0T4NzOoM9THIVvlc0ZIfS1YANt1603Wbjbcc/mrdmz7z1YlAvdnv9Q0V8DhNKW0SCjV+6BjMxnUcpjORH2qWsk+DmWtsfj80IFLraMVq97jjtPtu12zl7YiirREsSrkbjY9vhrFRFiH08oGgo5QeB2WEOlj6bXM6twN4+Yvn+qyffbClGT7/ppkN6/kH0mK8L75fm9dclvzqc3sZgkwxJA0WH17NyhacMc7Q7RRgdmELzufLodstoOjH9U/Q1Szl6KXXPXqbeGm3+pt7CcBedmSfwkk9WCuY2IK7lZo1Tn4p4tCtiEPXIg7dizjli5HKQ0q23XVKRKkrlL9Qy438oaV5l4N6JGp3P3tF9HYGbLZHug3kfIhmfFJJcQ1q+y1DpZnubsP5bA+Wa7uDbrPZ6/xe1tlJ/89uAbEHA7Qc3aq7Tr/r9jrtVrvd7f5epnjPk/7sRkFtvLdRbi2pv5eN7nbwhzdZ1Y5eL2GpCotnaFdeOEdrVcffde7V06uGuZ4OGyJqlAqhbtjm1TGXL86qa3ZWHbKDjaxjd7IJw6HW20GX5WT3QQ537H2Qk90HOfHEsffXTn7X7OS3pA/fp6A8qgfJLCw9lAvXvkXQjYYcpziqXK0396qN
VQJwzDO5dbB1ldqXfWsP+/KH7U3neNBpOt1W2y3xKW+mZp7s7cKueNPXeD+mM9ExrMnEvr/bHDjO4uiXOH+aVgasolM6jCf2n0JXCLYFrdDbD+3gkx+1ubsh33sduA32wazecvpuu+30Bt0dzzhvtHoV9l6tftNIeTD8/Q3fG7htRO3gLuFehb1Pw2/eFhzsfV97t52WOzh2BseDH+5g7yrsfdp7/SLoI7T2lsDV92AHzYjh2jXgQSFiWLoF/QjVsfe62G73eo47aLfuVBe3sffaELFXxSX3R2jrigaxfKN/0Aglg+KDxkeojr3PxL1O59jptbp3aZqqsPeZDMrfqj5CW28JXPWp7qAXGqbWvlR+hCrZe4/QbTc7znGv1btTj7CNvc+0sPYR+mDs+xu71Ru4Trcz6N7J2NvY+70hK70vOBh7D7di+f/ucrdbsS3svd6S2Kcjz7PHIwdz3/9SrNOnTxdu7y6JvAp7r/1ddtGx9j7oYPQ9TPjdrus00ZzfbcLfwt6n0deefh2MfX9jdzq9ntNqd9p3MvY29j6Nvfmq7//M3tvrG9/480eG5j9dG4rVf72yvvEgI0R/DB/8F4+Tql7oTQAA
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue