From 2233750a37468fd1643d8c290c8d2b720d90e4fe Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 9 Dec 2020 09:45:19 +0100 Subject: [PATCH] original orcid xml data are stored in a field of the class that models orcid data --- .../{Summary.java => AuthorHistory.java} | 2 +- .../dhp/schema/orcid/AuthorSummary.java | 23 ++++ .../dhp/schema/orcid}/Contributor.java | 8 +- .../dnetlib/dhp/schema/orcid}/ExternalId.java | 2 +- .../dnetlib/dhp/schema/orcid/OrcidData.java | 14 +++ .../dhp/schema/orcid}/PublicationDate.java | 2 +- .../eu/dnetlib/dhp/schema/orcid/Work.java | 14 +++ .../dnetlib/dhp/schema/orcid/WorkDetail.java | 9 +- .../orcid/SparkUpdateOrcidDatasets.java | 101 ++++------------ .../doiboost/orcid/json/JsonHelper.java | 4 +- .../doiboost/orcid/xml/XMLRecordParser.java | 113 +++++++++++++++++- .../orcidnodoi/ActivitiesDumpReader.java | 18 +-- .../SparkGenEnrichedOrcidWorks.java | 14 +-- .../orcidnodoi/similarity/AuthorMatcher.java | 6 +- .../orcidnodoi/xml/XMLRecordParserNoDoi.java | 12 +- .../orcid_update/oozie_app/workflow.xml | 2 +- .../orcid/xml/XMLRecordParserTest.java | 32 ++++- .../orcidnodoi/xml/OrcidNoDoiTest.java | 14 +-- 18 files changed, 264 insertions(+), 126 deletions(-) rename dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/{Summary.java => AuthorHistory.java} (96%) create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java rename {dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid}/Contributor.java (84%) rename {dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid}/ExternalId.java (92%) create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java rename {dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid}/PublicationDate.java (92%) create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java rename dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java (86%) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Summary.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java similarity index 96% rename from dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Summary.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java index ffebf50210..554aae82cf 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Summary.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorHistory.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.schema.orcid; import java.io.Serializable; -public class Summary implements Serializable { +public class AuthorHistory implements Serializable { private String creationMethod; private String completionDate; private String submissionDate; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java new file mode 100644 index 0000000000..1f773b6c94 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/AuthorSummary.java @@ -0,0 +1,23 @@ + +package eu.dnetlib.dhp.schema.orcid; + +public class AuthorSummary extends OrcidData { + AuthorData authorData; + AuthorHistory authorHistory; + + public AuthorData getAuthorData() { + return authorData; + } + + public void setAuthorData(AuthorData authorData) { + this.authorData = authorData; + } + + public AuthorHistory getAuthorHistory() { + return authorHistory; + } + + public void setAuthorHistory(AuthorHistory authorHistory) { + this.authorHistory = authorHistory; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java similarity index 84% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java index 9222c1cc40..3b543db4b0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Contributor.java @@ -1,5 +1,5 @@ -package eu.dnetlib.doiboost.orcidnodoi.model; +package eu.dnetlib.dhp.schema.orcid; import java.io.Serializable; @@ -12,9 +12,9 @@ import eu.dnetlib.dhp.schema.orcid.AuthorData; public class Contributor extends AuthorData implements Serializable { private String sequence; private String role; - private transient boolean simpleMatch = false; - private transient Double score = 0.0; - private transient boolean bestMatch = false; + private transient boolean simpleMatch; + private transient Double score; + private transient boolean bestMatch; public String getSequence() { return sequence; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java similarity index 92% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java index 7fe50ce25a..8bb750b2a3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/ExternalId.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/ExternalId.java @@ -1,5 +1,5 @@ -package eu.dnetlib.doiboost.orcidnodoi.model; +package eu.dnetlib.dhp.schema.orcid; /** * This class models the data related to external id, that are retrieved from an orcid publication diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java new file mode 100644 index 0000000000..bbc7239cd6 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidData.java @@ -0,0 +1,14 @@ + +package eu.dnetlib.dhp.schema.orcid; + +public class OrcidData { + protected String base64CompressData; + + public String getBase64CompressData() { + return base64CompressData; + } + + public void setBase64CompressData(String base64CompressData) { + this.base64CompressData = base64CompressData; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java similarity index 92% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java index 5f794d8eb6..1d44676a32 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/PublicationDate.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/PublicationDate.java @@ -1,5 +1,5 @@ -package eu.dnetlib.doiboost.orcidnodoi.model; +package eu.dnetlib.dhp.schema.orcid; /** * This class models the data related to a publication date, that are retrieved from an orcid publication diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java new file mode 100644 index 0000000000..a0953a4652 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/Work.java @@ -0,0 +1,14 @@ + +package eu.dnetlib.dhp.schema.orcid; + +public class Work extends OrcidData { + WorkDetail workDetail; + + public WorkDetail getWorkDetail() { + return workDetail; + } + + public void setWorkDetail(WorkDetail workDetail) { + this.workDetail = workDetail; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java similarity index 86% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java index 58f992d12a..614d415c15 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/WorkDetail.java @@ -1,14 +1,19 @@ -package eu.dnetlib.doiboost.orcidnodoi.model; +package eu.dnetlib.dhp.schema.orcid; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.orcid.Contributor; +import eu.dnetlib.dhp.schema.orcid.ExternalId; +import eu.dnetlib.dhp.schema.orcid.OrcidData; +import eu.dnetlib.dhp.schema.orcid.PublicationDate; + /** * This class models the data that are retrieved from orcid publication */ -public class WorkDataNoDoi implements Serializable { +public class WorkDetail implements Serializable { private String oid; private String id; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkUpdateOrcidDatasets.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkUpdateOrcidDatasets.java index ed7114b272..d479a91024 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkUpdateOrcidDatasets.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkUpdateOrcidDatasets.java @@ -4,44 +4,27 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.esotericsoftware.minlog.Log; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.orcid.AuthorData; -import eu.dnetlib.doiboost.orcid.model.WorkData; +import eu.dnetlib.dhp.schema.orcid.AuthorSummary; +import eu.dnetlib.dhp.schema.orcid.Work; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; -import scala.Tuple2; public class SparkUpdateOrcidDatasets { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws IOException, Exception { Logger logger = LoggerFactory.getLogger(SparkUpdateOrcidDatasets.class); logger.info("[ SparkUpdateOrcidDatasets STARTED]"); @@ -70,71 +53,35 @@ public class SparkUpdateOrcidDatasets { spark -> { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true"); - JavaPairRDD xmlSummariesRDD = sc .sequenceFile(workingPath.concat("xml/authors/xml_authors.seq"), Text.class, Text.class); xmlSummariesRDD - .repartition(5) - .map(seq -> XMLRecordParser.VTDParseAuthorData(seq._2().toString().getBytes())) - .filter(summary -> summary != null) - .mapToPair( - summary -> new Tuple2<>(summary.getOid(), - OBJECT_MAPPER.writeValueAsString(summary))) - .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) - .saveAsNewAPIHadoopFile( - workingPath.concat("orcid_dataset/authors"), - Text.class, - Text.class, - SequenceFileOutputFormat.class, - sc.hadoopConfiguration()); + .map(seq -> { + AuthorSummary authorSummary = XMLRecordParser + .VTDParseAuthorSummary(seq._2().toString().getBytes()); + authorSummary + .setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString())); + return authorSummary; + }) + .filter(authorSummary -> authorSummary != null) + .map(authorSummary -> JsonWriter.create(authorSummary)) + .saveAsTextFile(workingPath.concat("orcid_dataset/authors"), GzipCodec.class); JavaPairRDD xmlWorksRDD = sc .sequenceFile(workingPath.concat("xml/works/*"), Text.class, Text.class); xmlWorksRDD - .map(seq -> XMLRecordParserNoDoi.VTDParseWorkData(seq._2().toString().getBytes())) + .map(seq -> { + WorkDetail workDetail = XMLRecordParserNoDoi.VTDParseWorkData(seq._2().toString().getBytes()); + Work work = new Work(); + work.setWorkDetail(workDetail); + work.setBase64CompressData(ArgumentApplicationParser.compressArgument(seq._2().toString())); + return work; + }) .filter(work -> work != null) - .mapToPair( - work -> new Tuple2<>(work.getOid().concat("_").concat(work.getId()), - OBJECT_MAPPER.writeValueAsString(work))) - .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) - .saveAsNewAPIHadoopFile( - workingPath.concat("orcid_dataset/works"), - Text.class, - Text.class, - SequenceFileOutputFormat.class, - sc.hadoopConfiguration()); + .map(work -> JsonWriter.create(work)) + .saveAsTextFile(workingPath.concat("orcid_dataset/works"), GzipCodec.class); }); } - - private static AuthorData loadAuthorFromJson(Text orcidId, Text json) { - AuthorData authorData = new AuthorData(); - authorData.setOid(orcidId.toString()); - JsonElement jElement = new JsonParser().parse(json.toString()); - authorData.setName(getJsonValue(jElement, "name")); - authorData.setSurname(getJsonValue(jElement, "surname")); - authorData.setCreditName(getJsonValue(jElement, "creditname")); - return authorData; - } - - private static WorkData loadWorkFromJson(Text orcidId, Text json) { - WorkData workData = new WorkData(); - workData.setOid(orcidId.toString()); - JsonElement jElement = new JsonParser().parse(json.toString()); - workData.setDoi(getJsonValue(jElement, "doi")); - return workData; - } - - private static String getJsonValue(JsonElement jElement, String property) { - if (jElement.getAsJsonObject().has(property)) { - JsonElement name = null; - name = jElement.getAsJsonObject().get(property); - if (name != null && !name.isJsonNull()) { - return name.getAsString(); - } - } - return null; - } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java index 94f7d8c913..a2342f7b4d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java @@ -3,11 +3,11 @@ package eu.dnetlib.doiboost.orcid.json; import com.google.gson.Gson; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; public class JsonHelper { - public static String createOidWork(WorkDataNoDoi workData) { + public static String createOidWork(WorkDetail workData) { return new Gson().toJson(workData); } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index b6acadb725..c98d63b913 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -12,8 +12,9 @@ import com.ximpleware.*; import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.schema.orcid.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorHistory; +import eu.dnetlib.dhp.schema.orcid.AuthorSummary; import eu.dnetlib.doiboost.orcid.model.WorkData; -import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; public class XMLRecordParser { @@ -234,4 +235,114 @@ public class XMLRecordParser { } return workIdLastModifiedDate; } + + public static AuthorSummary VTDParseAuthorSummary(byte[] bytes) + throws VtdException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL); + ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL); + ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL); + ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + ap.declareXPathNameSpace(NS_HISTORY, NS_HISTORY_URL); + + AuthorData authorData = retrieveAuthorData(ap, vn, bytes); + AuthorHistory authorHistory = retrieveAuthorHistory(ap, vn, bytes); + AuthorSummary authorSummary = new AuthorSummary(); + authorSummary.setAuthorData(authorData); + authorSummary.setAuthorHistory(authorHistory); + return authorSummary; + } + + private static AuthorData retrieveAuthorData(AutoPilot ap, VTDNav vn, byte[] bytes) + throws VtdException { + AuthorData authorData = new AuthorData(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if (!errors.isEmpty()) { + authorData.setErrorCode(errors.get(0)); + return authorData; + } + + List recordNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//record:record", Arrays.asList("path")); + if (!recordNodes.isEmpty()) { + final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1); + authorData.setOid(oid); + } else { + return null; + } + + final List names = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names"); + if (!names.isEmpty()) { + authorData.setName(names.get(0)); + } + + final List surnames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name"); + if (!surnames.isEmpty()) { + authorData.setSurname(surnames.get(0)); + } + + final List creditNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name"); + if (!creditNames.isEmpty()) { + authorData.setCreditName(creditNames.get(0)); + } + + final List otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content"); + if (!otherNames.isEmpty()) { + authorData.setOtherNames(otherNames); + } + return authorData; + } + + private static AuthorHistory retrieveAuthorHistory(AutoPilot ap, VTDNav vn, byte[] bytes) + throws VtdException { + AuthorHistory authorHistory = new AuthorHistory(); + final String creationMethod = VtdUtilityParser.getSingleValue(ap, vn, "//history:creation-method"); + if (StringUtils.isNoneBlank(creationMethod)) { + authorHistory.setCreationMethod(creationMethod); + } + + final String completionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:completion-date"); + if (StringUtils.isNoneBlank(completionDate)) { + authorHistory.setCompletionDate(completionDate); + } + + final String submissionDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:submission-date"); + if (StringUtils.isNoneBlank(submissionDate)) { + authorHistory.setSubmissionDate(submissionDate); + } + + final String claimed = VtdUtilityParser.getSingleValue(ap, vn, "//history:claimed"); + if (StringUtils.isNoneBlank(claimed)) { + authorHistory.setClaimed(Boolean.parseBoolean(claimed)); + } + + final String verifiedEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-email"); + if (StringUtils.isNoneBlank(verifiedEmail)) { + authorHistory.setVerifiedEmail(Boolean.parseBoolean(verifiedEmail)); + } + + final String verifiedPrimaryEmail = VtdUtilityParser.getSingleValue(ap, vn, "//history:verified-primary-email"); + if (StringUtils.isNoneBlank(verifiedPrimaryEmail)) { + authorHistory.setVerifiedPrimaryEmail(Boolean.parseBoolean(verifiedPrimaryEmail)); + } + + final String deactivationDate = VtdUtilityParser.getSingleValue(ap, vn, "//history:deactivation-date"); + if (StringUtils.isNoneBlank(deactivationDate)) { + authorHistory.setDeactivationDate(deactivationDate); + } + + final String lastModifiedDate = VtdUtilityParser + .getSingleValue(ap, vn, "//history:history/common:last-modified-date"); + if (StringUtils.isNoneBlank(lastModifiedDate)) { + authorHistory.setLastModifiedDate(lastModifiedDate); + } + return authorHistory; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index c2cfafd874..04a3389ed0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -19,8 +19,8 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; /** @@ -87,29 +87,29 @@ public class ActivitiesDumpReader { while ((line = br.readLine()) != null) { buffer.append(line); } - WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi + WorkDetail workDetail = XMLRecordParserNoDoi .VTDParseWorkData(buffer.toString().getBytes()); - if (workDataNoDoi != null) { - if (workDataNoDoi.getErrorCode() != null) { + if (workDetail != null) { + if (workDetail.getErrorCode() != null) { errorFromOrcidFound += 1; Log .debug( "error from Orcid with code " - + workDataNoDoi.getErrorCode() + + workDetail.getErrorCode() + " for entry " + entry.getName()); continue; } - boolean isDoiFound = workDataNoDoi + boolean isDoiFound = workDetail .getExtIds() .stream() .filter(e -> e.getType() != null) .anyMatch(e -> e.getType().equals("doi")); if (!isDoiFound) { - String jsonData = JsonHelper.createOidWork(workDataNoDoi); - Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData); + String jsonData = JsonHelper.createOidWork(workDetail); + Log.debug("oid: " + workDetail.getOid() + " data: " + jsonData); - final Text key = new Text(workDataNoDoi.getOid()); + final Text key = new Text(workDetail.getOid()); final Text value = new Text(jsonData); try { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index 2d26adce6a..d588920276 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -30,8 +30,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.orcid.AuthorData; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; import eu.dnetlib.doiboost.orcid.json.JsonHelper; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; @@ -81,10 +81,10 @@ public class SparkGenEnrichedOrcidWorks { JavaPairRDD activitiesRDD = sc .sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class); - Dataset activitiesDataset = spark + Dataset activitiesDataset = spark .createDataset( activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), - Encoders.bean(WorkDataNoDoi.class)); + Encoders.bean(WorkDetail.class)); logger.info("Works data loaded: " + activitiesDataset.count()); JavaRDD> enrichedWorksRDD = activitiesDataset @@ -92,8 +92,8 @@ public class SparkGenEnrichedOrcidWorks { summariesDataset, activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner") .map( - (MapFunction, Tuple2>) value -> { - WorkDataNoDoi w = value._1; + (MapFunction, Tuple2>) value -> { + WorkDetail w = value._1; AuthorData a = value._2; AuthorMatcher.match(a, w.getContributors()); return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w)); @@ -161,9 +161,9 @@ public class SparkGenEnrichedOrcidWorks { return authorData; } - private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) { + private static WorkDetail loadWorkFromJson(Text orcidId, Text json) { - WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class); + WorkDetail workData = new Gson().fromJson(json.toString(), WorkDetail.class); return workData; } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java index c0f6178684..e36ed3bbfd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java @@ -19,8 +19,8 @@ import com.ximpleware.XPathParseException; import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.schema.orcid.AuthorData; -import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.dhp.schema.orcid.Contributor; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; /** * This class is used for searching from a list of publication contributors a @@ -209,7 +209,7 @@ public class AuthorMatcher { } } - private static String toJson(WorkDataNoDoi work) { + private static String toJson(WorkDetail work) { GsonBuilder builder = new GsonBuilder(); Gson gson = builder.create(); return gson.toJson(work); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java index f4b0934020..15cd4f268c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/xml/XMLRecordParserNoDoi.java @@ -12,10 +12,10 @@ import com.ximpleware.*; import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; -import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; -import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId; -import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.dhp.schema.orcid.Contributor; +import eu.dnetlib.dhp.schema.orcid.ExternalId; +import eu.dnetlib.dhp.schema.orcid.PublicationDate; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; /** * This class is used for parsing xml data with vtd parser @@ -42,7 +42,7 @@ public class XMLRecordParserNoDoi { private static final String NS_ERROR = "error"; - public static WorkDataNoDoi VTDParseWorkData(byte[] bytes) + public static WorkDetail VTDParseWorkData(byte[] bytes) throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException, NavException, XPathEvalException { final VTDGen vg = new VTDGen(); @@ -54,7 +54,7 @@ public class XMLRecordParserNoDoi { ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); - WorkDataNoDoi workData = new WorkDataNoDoi(); + WorkDetail workData = new WorkDetail(); final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); if (!errors.isEmpty()) { workData.setErrorCode(errors.get(0)); diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml index d2238a3783..12441284c9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_update/oozie_app/workflow.xml @@ -2,7 +2,7 @@ spark2MaxExecutors - 5 + 40 sparkDriverMemory diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index 722e9fd342..0bcce35f50 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -5,21 +5,23 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; -import java.time.LocalDate; -import java.util.HashMap; import java.util.Map; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; -import com.ximpleware.*; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.orcid.AuthorData; +import eu.dnetlib.dhp.schema.orcid.AuthorSummary; +import eu.dnetlib.dhp.schema.orcid.Work; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; import eu.dnetlib.doiboost.orcid.OrcidClientTest; import eu.dnetlib.doiboost.orcid.SparkDownloadOrcidWorks; import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; +import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; public class XMLRecordParserTest { private static final String NS_WORK = "work"; @@ -29,7 +31,7 @@ public class XMLRecordParserTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @Test - public void testOrcidAuthorDataXMLParser() throws Exception { + private void testOrcidAuthorDataXMLParser() throws Exception { String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml")); @@ -104,4 +106,26 @@ public class XMLRecordParserTest { } }); } + + @Test + public void testAuthorSummaryXMLParser() throws Exception { + String xml = IOUtils + .toString( + this.getClass().getResourceAsStream("record_0000-0001-5004-5918.xml")); + AuthorSummary authorSummary = XMLRecordParser.VTDParseAuthorSummary(xml.getBytes()); + authorSummary.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml)); + OrcidClientTest.logToFile(JsonWriter.create(authorSummary)); + } + + @Test + public void testWorkDataXMLParser() throws Exception { + String xml = IOUtils + .toString( + this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); + WorkDetail workDetail = XMLRecordParserNoDoi.VTDParseWorkData(xml.getBytes()); + Work work = new Work(); + work.setWorkDetail(workDetail); + work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml)); + OrcidClientTest.logToFile(JsonWriter.create(work)); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index 1f77197ab5..efe01522c7 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -21,8 +21,8 @@ import com.ximpleware.XPathParseException; import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.schema.orcid.AuthorData; -import eu.dnetlib.doiboost.orcidnodoi.model.Contributor; -import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.dhp.schema.orcid.Contributor; +import eu.dnetlib.dhp.schema.orcid.WorkDetail; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; public class OrcidNoDoiTest { @@ -48,7 +48,7 @@ public class OrcidNoDoiTest { if (p == null) { logger.info("XMLRecordParserNoDoi null"); } - WorkDataNoDoi workData = null; + WorkDetail workData = null; try { workData = p.VTDParseWorkData(xml.getBytes()); } catch (Exception e) { @@ -105,7 +105,7 @@ public class OrcidNoDoiTest { if (p == null) { logger.info("XMLRecordParserNoDoi null"); } - WorkDataNoDoi workData = null; + WorkDetail workData = null; try { workData = p.VTDParseWorkData(xml.getBytes()); } catch (Exception e) { @@ -136,7 +136,7 @@ public class OrcidNoDoiTest { if (p == null) { logger.info("XMLRecordParserNoDoi null"); } - WorkDataNoDoi workData = null; + WorkDetail workData = null; try { workData = p.VTDParseWorkData(xml.getBytes()); } catch (Exception e) { @@ -179,7 +179,7 @@ public class OrcidNoDoiTest { if (p == null) { logger.info("XMLRecordParserNoDoi null"); } - WorkDataNoDoi workData = null; + WorkDetail workData = null; try { workData = p.VTDParseWorkData(xml.getBytes()); } catch (Exception e) { @@ -308,7 +308,7 @@ public class OrcidNoDoiTest { if (p == null) { logger.info("XMLRecordParserNoDoi null"); } - WorkDataNoDoi workData = null; + WorkDetail workData = null; try { workData = p.VTDParseWorkData(xml.getBytes()); } catch (Exception e) {