Collect all the atomic actions for each result type and save them all in the action set (AS) path

2020-12-01 14:29:18 +01:00 · 2020-12-01 14:29:18 +01:00 · 45d06c45c7
parent 0051ebede5
commit 45d06c45c7
4 changed files with 338 additions and 0 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/CollectAndSave.java
@ -0,0 +1,85 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+/**
+ * Spark job that reads the sequence files stored for each result type (publication, dataset,
+ * otherresearchproduct, software) under the input path and writes their union, as sequence-file
+ * output, to the output path.
+ */
+public class CollectAndSave implements Serializable {
+
+	private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
+
+	/**
+	 * Parses the command-line arguments described by {@code input_actionset_parameter.json}, then removes
+	 * the output path and rebuilds it from the content found under the input path.
+	 *
+	 * @param args command-line arguments; {@code isSparkSessionManaged} (treated as true when absent),
+	 *             {@code inputPath} and {@code outputPath} are the ones read here
+	 * @throws Exception if the parameter specification cannot be read, the arguments cannot be parsed or
+	 *                   the Spark job fails
+	 */
+	public static <I extends Result> void main(String[] args) throws Exception {
+
+		String jsonConfiguration = IOUtils
+			.toString(
+				CollectAndSave.class
+					.getResourceAsStream(
+						"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+		parser.parseArgument(args);
+
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = parser.get("inputPath");
+		log.info("inputPath: {}", inputPath);
+
+		final String outputPath = parser.get("outputPath");
+		log.info("outputPath: {}", outputPath);
+
+		SparkConf conf = new SparkConf();
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				// the output location is removed first, then rewritten from scratch
+				removeOutputDir(spark, outputPath);
+				collectAndSave(spark, inputPath, outputPath);
+			});
+	}
+
+	/**
+	 * Unions the {@code Text}/{@code Text} sequence files found in the four per-type sub-directories of
+	 * {@code inputPath} and saves the result under {@code outputPath}.
+	 *
+	 * @param spark the active Spark session
+	 * @param inputPath base path holding the publication, dataset, otherresearchproduct and software
+	 *                  sub-directories
+	 * @param outputPath path the merged sequence-file output is written to
+	 */
+	private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
+		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		sc
+			.sequenceFile(inputPath + "/publication", Text.class, Text.class)
+			.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
+			.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
+			.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
+			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+	}
+
+	/**
+	 * Removes {@code path} through {@code HdfsSupport}, using the Hadoop configuration of the given session.
+	 *
+	 * @param spark the active Spark session
+	 * @param path the path to remove
+	 */
+	private static void removeOutputDir(SparkSession spark, String path) {
+		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
@ -0,0 +1,32 @@
+[
+  {
+  "paramName": "issm",
+  "paramLongName": "isSparkSessionManaged",
+  "paramDescription": "when true will stop SparkSession after job execution",
+  "paramRequired": false
+},
+{
+"paramName": "ip",
+"paramLongName": "inputPath",
+"paramDescription": "the base path under which the atomic actions of each result type are stored",
+"paramRequired": true
+},
+{
+"paramName": "o",
+"paramLongName": "outputPath",
+"paramDescription": "the path of the new ActionSet",
+"paramRequired": true
+},
+  {
+    "paramName": "rtn",
+    "paramLongName": "resultTableName",
+    "paramDescription": "the name of the result table",
+    "paramRequired": true
+  },
+  {
+    "paramName": "bsp",
+    "paramLongName": "bipScorePath",
+    "paramDescription": "the path of the bip scores",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml
@ -0,0 +1,58 @@
+<configuration><!-- Default values for properties used by the Oozie application in this directory -->
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value><!-- NOTE(review): host name suggests a test cluster; confirm it is overridden per environment -->
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value><!-- NOTE(review): host name suggests a test cluster; confirm it is overridden per environment -->
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>sparkExecutorNumber</name><!-- NOTE(review): no spark-opts entry in the accompanying workflow.xml references this property -->
+        <value>4</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>sparkDriverMemory</name>
+        <value>15G</value>
+    </property>
+    <property>
+        <name>sparkExecutorMemory</name>
+        <value>6G</value>
+    </property>
+    <property>
+        <name>sparkExecutorCores</name>
+        <value>1</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
@ -0,0 +1,163 @@
+<workflow-app name="H2020Programme" xmlns="uri:oozie:workflow:0.5">
+    <parameters><!-- Inputs supplied when the workflow is submitted. NOTE(review): this file sits under actionmanager/bipfinder, yet it declares only project, programme and topic inputs and every action below runs a class from eu.dnetlib.dhp.actionmanager.project; CollectAndSave is never invoked. Confirm whether this is a placeholder copied from the project workflow. -->
+        <property>
+            <name>projectFileURL</name>
+            <description>the url where to get the projects file</description>
+        </property>
+
+        <property>
+            <name>programmeFileURL</name>
+            <description>the url where to get the programme file</description>
+        </property>
+
+        <property>
+            <name>topicFileURL</name>
+            <description>the url where to get the topic file</description>
+        </property>
+        <property>
+            <name>outputPath</name>
+            <description>path where to store the action set</description>
+        </property>
+    </parameters>
+
+    <start to="deleteoutputpath"/>
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <action name="deleteoutputpath"><!-- First action: deletes and recreates outputPath and workingDir, then moves on to get_project_file -->
+        <fs>
+            <delete path='${outputPath}'/>
+            <mkdir path='${outputPath}'/>
+            <delete path='${workingDir}'/>
+            <mkdir path='${workingDir}'/>
+        </fs>
+        <ok to="get_project_file"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="get_project_file"><!-- Runs ReadCSV with fileURL set to projectFileURL, hdfsPath set to workingDir/projects and CSVProject as classForName -->
+        <java>
+            <main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
+            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+            <arg>--fileURL</arg><arg>${projectFileURL}</arg>
+            <arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
+            <arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProject</arg>
+        </java>
+        <ok to="get_programme_file"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="get_programme_file"><!-- Runs ReadCSV with fileURL set to programmeFileURL, hdfsPath set to workingDir/programme and CSVProgramme as classForName -->
+        <java>
+            <main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
+            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+            <arg>--fileURL</arg><arg>${programmeFileURL}</arg>
+            <arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
+            <arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme</arg>
+        </java>
+        <ok to="get_topic_file"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="get_topic_file"><!-- Runs ReadExcel with fileURL set to topicFileURL, hdfsPath set to workingDir/topic and EXCELTopic as classForName -->
+        <java>
+            <main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadExcel</main-class>
+            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+            <arg>--fileURL</arg><arg>${topicFileURL}</arg>
+            <arg>--hdfsPath</arg><arg>${workingDir}/topic</arg>
+            <arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
+        </java>
+        <ok to="read_projects"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="read_projects"><!-- Runs ReadProjectsFromDB with hdfsPath set to workingDir/dbProjects. NOTE(review): postgresURL, postgresUser and postgresPassword are used here but are not declared in the parameters section of this file -->
+        <java>
+            <main-class>eu.dnetlib.dhp.actionmanager.project.ReadProjectsFromDB</main-class>
+            <arg>--hdfsPath</arg><arg>${workingDir}/dbProjects</arg>
+            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
+            <arg>--postgresUrl</arg><arg>${postgresURL}</arg>
+            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
+            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
+        </java>
+        <ok to="prepare_programme"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="prepare_programme"><!-- Spark action running PrepareProgramme with programmePath set to workingDir/programme and outputPath set to workingDir/preparedProgramme -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PrepareProgramme</name>
+            <class>eu.dnetlib.dhp.actionmanager.project.PrepareProgramme</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--programmePath</arg><arg>${workingDir}/programme</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
+        </spark>
+        <ok to="prepare_project"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="prepare_project"><!-- Spark action running PrepareProjects with projectPath set to workingDir/projects, dbProjectPath set to workingDir/dbProjects and outputPath set to workingDir/preparedProjects -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>PrepareProjects</name>
+            <class>eu.dnetlib.dhp.actionmanager.project.PrepareProjects</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--projectPath</arg><arg>${workingDir}/projects</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
+            <arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
+        </spark>
+        <ok to="create_updates"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="create_updates"><!-- Last action before End: Spark action running SparkAtomicActionJob on workingDir/preparedProjects, workingDir/preparedProgramme and workingDir/topic, with outputPath as its output argument -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>ProjectProgrammeAS</name>
+            <class>eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=3840
+            </spark-opts>
+            <arg>--projectPath</arg><arg>${workingDir}/preparedProjects</arg>
+            <arg>--programmePath</arg><arg>${workingDir}/preparedProgramme</arg>
+            <arg>--topicPath</arg><arg>${workingDir}/topic</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>