workflow to generate seq(doi,AuthorList)

2020-05-19 09:34:44 +02:00 · 2020-05-19 09:34:44 +02:00 · 7362bc3e9d
parent d4e9075f22
commit 7362bc3e9d
4 changed files with 190 additions and 0 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkGenerateDoiAuthorList.java
@ -0,0 +1,114 @@
 package eu.dnetlib.doiboost.orcid;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.List;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.util.LongAccumulator;
 import org.mortbay.log.Log;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import scala.Tuple2;
 public class SparkGenerateDoiAuthorList {
 	public static void main(String[] args) throws IOException, Exception {
 		Logger logger = LoggerFactory.getLogger(SparkGenerateDoiAuthorList.class);
 		logger.info("[ SparkGenerateDoiAuthorList STARTED]");
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
 					SparkGenerateDoiAuthorList.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json")));
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String workingPath = parser.get("workingPath");
 		logger.info("workingPath: ", workingPath);
 		final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
 		logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 				JavaPairRDD<Text, Text> summariesRDD = sc.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
 				Dataset<AuthorData> summariesDataset = spark
 				.createDataset(summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), 
 						Encoders.bean(AuthorData.class));
 				JavaPairRDD<Text, Text> activitiesRDD = sc.sequenceFile(workingPath + "/output/*.seq", Text.class, Text.class);
 				Dataset<WorkData> activitiesDataset = spark
 						.createDataset(activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(), 
 								Encoders.bean(WorkData.class));
 			});
 	}
 	private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
 		AuthorData authorData = new AuthorData();
 		authorData.setOid(orcidId.toString());
 		JsonElement jElement = new JsonParser().parse(json.toString());
 		authorData.setName(getJsonValue(jElement, "name"));
 		authorData.setSurname(getJsonValue(jElement, "surname"));
 		authorData.setCreditName(getJsonValue(jElement, "creditname"));
 		return authorData;
 	}
 	private static WorkData loadWorkFromJson(Text orcidId, Text json) {
 		WorkData workData = new WorkData();
 		workData.setOid(orcidId.toString());
 		JsonElement jElement = new JsonParser().parse(json.toString());
 		workData.setDoi(getJsonValue(jElement, "doi"));
 		return workData;
 	}
 	private static String getJsonValue(JsonElement jElement, String property) {
 		if (jElement.getAsJsonObject().has(property)) {
 			JsonElement name = null;
 			name = jElement.getAsJsonObject().get(property);
 			if (name != null && name.isJsonObject()) {
 				return name.getAsString();
 			}
 		}
 		return null;
 	}
 }
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json
@ -0,0 +1,3 @@
 [{"paramName":"w",   "paramLongName":"workingPath",	"paramDescription": "the working path",	"paramRequired": true},
 {"paramName":"o",   "paramLongName":"outputDoiAuthorListPath",	"paramDescription": "the relative folder of the sequencial file to write the data",	"paramRequired": true}
 ]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/config-default.xml
@ -0,0 +1,18 @@
 <configuration>
    <property>
            <name>jobTracker</name>
            <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
    </property>
    <property>
            <name>nameNode</name>
            <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
    </property>
    <property>
            <name>queueName</name>
            <value>default</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_doi_author_list/oozie_app/workflow.xml
@ -0,0 +1,55 @@
 <workflow-app name="Gen Orcid Authors" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>workingPath</name>
            <description>the working dir base path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
    </parameters>
    <start to="ResetWorkingPath"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="ResetWorkingPath">
        <fs>
            <delete path='${workingPath_activities}/doi_author_list'/>
        </fs>
        <ok to="Gen_Doi_Author_List"/>
        <error to="Kill"/>
    </action>
 	<action name="Gen_Doi_Author_List">
        <spark xmlns="uri:oozie:spark-action:0.2">
        	<job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Gen_Doi_Author_List</name>
            <class>eu.dnetlib.doiboost.orcid.SparkGenerateDoiAuthorList</class>
            <jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
            <spark-opts>--num-executors 1 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
            </spark-opts>
            <arg>-w</arg><arg>${workingPath}/</arg>
            <arg>-o</arg><arg>doi_author_list/</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
   <end name="End"/>
 </workflow-app>