From 5c65e602d385b5948707da4fbd24a4106a00ca89 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 7 Dec 2020 15:28:10 +0100 Subject: [PATCH] wf doi_authors generates one json data foreach row --- .../eu/dnetlib/dhp/schema/orcid/OrcidDOI.java | 29 ++++++++++--------- .../orcid/SparkGenerateDoiAuthorList.java | 21 ++++++-------- .../oozie_app/workflow.xml | 2 +- 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java index 11bce26c8..cf372c12a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java @@ -1,24 +1,25 @@ + package eu.dnetlib.dhp.schema.orcid; import java.util.List; public class OrcidDOI { - private String doi; - private List authors; + private String doi; + private List authors; - public String getDoi() { - return doi; - } + public String getDoi() { + return doi; + } - public void setDoi(String doi) { - this.doi = doi; - } + public void setDoi(String doi) { + this.doi = doi; + } - public List getAuthors() { - return authors; - } + public List getAuthors() { + return authors; + } - public void setAuthors(List authors) { - this.authors = authors; - } + public void setAuthors(List authors) { + this.authors = authors; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java index 4201ffb07..d831f8509 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java @@ -3,37 +3,32 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.esotericsoftware.minlog.Log; -import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.JsonElement; import com.google.gson.JsonParser; -import com.ximpleware.ParseException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.schema.orcid.AuthorData; +import eu.dnetlib.dhp.schema.orcid.OrcidDOI; import eu.dnetlib.doiboost.orcid.model.WorkData; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter; @@ -154,11 +149,13 @@ public class SparkGenerateDoiAuthorList { authorList.removeIf(a -> !oidsAlreadySeen.add(a.getOid())); return new Tuple2<>(s._1(), authorList); }) - .mapToPair( - s -> { - return new Tuple2<>(s._1(), JsonWriter.create(s._2())); - }) - .saveAsTextFile(workingPath + outputDoiAuthorListPath); + .map(s -> { + OrcidDOI orcidDOI = new OrcidDOI(); + orcidDOI.setDoi(s._1()); + orcidDOI.setAuthors(s._2()); + return JsonWriter.create(orcidDOI); + }) + .saveAsTextFile(workingPath + outputDoiAuthorListPath, GzipCodec.class); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml index a466db7f6..133a6f4bd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml @@ -14,7 +14,7 @@ spark2MaxExecutors - 40 + 20 oozieActionShareLibForSpark2