workflow to generate seq(doi,AuthorList)

2020-05-19 09:34:44 +02:00 · 2020-05-19 09:34:44 +02:00 · 7362bc3e9d
parent d4e9075f22
commit 7362bc3e9d
4 changed files with 190 additions and 0 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
@ -0,0 +1,114 @@
+
+package eu.dnetlib.doiboost.orcid;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.util.LongAccumulator;
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
+import eu.dnetlib.doiboost.orcid.model.WorkData;
+import scala.Tuple2;
+
+/**
+ * Spark job that reads ORCID author summaries and ORCID works from Hadoop sequence files
+ * (key: ORCID id, value: JSON document) and maps them to {@link AuthorData} and
+ * {@link WorkData} Datasets.
+ *
+ * NOTE(review): within this class no action is invoked on either Dataset and
+ * {@code outputDoiAuthorListPath} is only logged, so nothing is written yet.
+ */
+public class SparkGenerateDoiAuthorList {
+
+	private static final Logger logger = LoggerFactory.getLogger(SparkGenerateDoiAuthorList.class);
+
+	/**
+	 * Job entry point.
+	 *
+	 * @param args command line arguments, parsed against
+	 *             {@code gen_doi_author_list_orcid_parameters.json}
+	 * @throws Exception if the parameter definition cannot be read, the arguments cannot be
+	 *                   parsed, or the Spark job fails
+	 */
+	public static void main(String[] args) throws Exception {
+		logger.info("[ SparkGenerateDoiAuthorList STARTED]");
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkGenerateDoiAuthorList.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json")));
+		parser.parseArgument(args);
+		// Defaults to a managed session when the argument is not supplied.
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+		final String workingPath = parser.get("workingPath");
+		logger.info("workingPath: {}", workingPath);
+		final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
+		logger.info("outputDoiAuthorListPath: {}", outputDoiAuthorListPath);
+
+		// The relative paths below are appended to the working path, so make sure it ends
+		// with exactly one separator regardless of how the argument was passed.
+		final String basePath = workingPath.endsWith("/") ? workingPath : workingPath + "/";
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+				// Author summaries live in a sibling folder of the working path.
+				JavaPairRDD<Text, Text> summariesRDD = sc
+					.sequenceFile(basePath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
+				Dataset<AuthorData> summariesDataset = spark
+					.createDataset(
+						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(AuthorData.class));
+
+				// Works are read from every sequence file under <workingPath>/output.
+				JavaPairRDD<Text, Text> activitiesRDD = sc
+					.sequenceFile(basePath + "output/*.seq", Text.class, Text.class);
+				Dataset<WorkData> activitiesDataset = spark
+					.createDataset(
+						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(WorkData.class));
+
+			});
+
+	}
+
+	/**
+	 * Builds an {@link AuthorData} from one sequence file record.
+	 *
+	 * @param orcidId record key, stored as the author's oid
+	 * @param json    record value; the "name", "surname" and "creditname" properties are read
+	 * @return the populated author; properties missing from the JSON are left {@code null}
+	 */
+	private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
+		AuthorData authorData = new AuthorData();
+		authorData.setOid(orcidId.toString());
+		JsonElement jElement = new JsonParser().parse(json.toString());
+		authorData.setName(getJsonValue(jElement, "name"));
+		authorData.setSurname(getJsonValue(jElement, "surname"));
+		authorData.setCreditName(getJsonValue(jElement, "creditname"));
+		return authorData;
+	}
+
+	/**
+	 * Builds a {@link WorkData} from one sequence file record.
+	 *
+	 * @param orcidId record key, stored as the work's oid
+	 * @param json    record value; the "doi" property is read
+	 * @return the populated work; the doi is {@code null} when missing from the JSON
+	 */
+	private static WorkData loadWorkFromJson(Text orcidId, Text json) {
+		WorkData workData = new WorkData();
+		workData.setOid(orcidId.toString());
+		JsonElement jElement = new JsonParser().parse(json.toString());
+		workData.setDoi(getJsonValue(jElement, "doi"));
+		return workData;
+	}
+
+	/**
+	 * Reads a scalar property of a JSON object as a string.
+	 *
+	 * @param jElement the parsed JSON document
+	 * @param property the name of the property to read
+	 * @return the property value as a string, or {@code null} when {@code jElement} is not a
+	 *         JSON object, the property is absent, or its value is not a JSON primitive
+	 */
+	private static String getJsonValue(JsonElement jElement, String property) {
+		if (jElement == null || !jElement.isJsonObject()) {
+			return null;
+		}
+		JsonElement value = jElement.getAsJsonObject().get(property);
+		// getAsString() is only supported on primitives: calling it on a JSON object throws,
+		// and JSON null / arrays carry no single string value.
+		if (value != null && value.isJsonPrimitive()) {
+			return value.getAsString();
+		}
+		return null;
+	}
+}
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json
@ -0,0 +1,3 @@
+[{"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the working path",	"paramRequired": true},
+ {"paramName":"o",   "paramLongName":"outputDoiAuthorListPath",	"paramDescription": "the relative folder of the sequential file to write the data",	"paramRequired": true}
+]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/config-default.xml
@ -0,0 +1,18 @@
+<!-- Default job properties for the orcid_doi_author_list Oozie application. -->
+<configuration>
+    <!-- Referenced as ${jobTracker} by the workflow's <job-tracker> element. -->
+    <property>
+            <name>jobTracker</name>
+            <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+    </property>
+    <!-- Referenced as ${nameNode} by the workflow's <name-node> element. -->
+    <property>
+            <name>nameNode</name>
+            <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
+    </property>
+    <!-- NOTE(review): queueName is not referenced by the workflow.xml in this commit; confirm it is still needed. -->
+    <property>
+            <name>queueName</name>
+            <value>default</value>
+    </property>
+    <!-- Share lib used by Oozie for spark actions. -->
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml
@ -0,0 +1,55 @@
+<!--
+    Oozie workflow that clears the previous doi_author_list output and then runs
+    eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList on YARN.
+-->
+<workflow-app name="Gen Orcid Authors" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+    </parameters>
+
+    <start to="ResetWorkingPath"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <!--
+        Removes the output of a previous run. The path is built from the declared
+        workingPath parameter so that it matches the location handed to the Spark
+        action below (-w ${workingPath}/ and -o doi_author_list/).
+    -->
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${workingPath}/doi_author_list'/>
+        </fs>
+        <ok to="Gen_Doi_Author_List"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Runs the Spark job in yarn cluster mode with a single executor. -->
+    <action name="Gen_Doi_Author_List">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Gen_Doi_Author_List</name>
+            <class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
+            <jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
+            <spark-opts>--num-executors 1 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
+            </spark-opts>
+            <!-- The trailing slash is kept for compatibility with job code that concatenates relative paths. -->
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-o</arg><arg>doi_author_list/</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+   <end name="End"/>
+</workflow-app>