Reusable RunSQLSparkJob for executing SQL in Spark through Oozie Spark Actions

Implements the pivots table update Oozie workflow
Giambattista Bloisi 2024-01-15 00:08:07 +01:00
parent c67467723b
commit 21a14fcd80
5 changed files with 278 additions and 0 deletions


@@ -0,0 +1,75 @@
package eu.dnetlib.dhp.oozie;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.text.StringSubstitutor;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Resources;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class RunSQLSparkJob {
    private static final Logger log = LoggerFactory.getLogger(RunSQLSparkJob.class);

    private final ArgumentApplicationParser parser;

    public RunSQLSparkJob(ArgumentApplicationParser parser) {
        this.parser = parser;
    }

    public static void main(String[] args) throws Exception {

        // collect generic "--name value" pairs from the command line; they configure the job
        // and provide the values substituted into the ${placeholders} of the SQL script
        Map<String, String> params = new HashMap<>();
        for (int i = 0; i < args.length - 1; i++) {
            if (args[i].startsWith("--")) {
                params.put(args[i].substring(2), args[++i]);
            }
        }

        /*
         * String jsonConfiguration = IOUtils .toString( Objects .requireNonNull( RunSQLSparkJob.class
         * .getResourceAsStream( "/eu/dnetlib/dhp/oozie/run_sql_parameters.json"))); final ArgumentApplicationParser
         * parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args);
         */

        Boolean isSparkSessionManaged = Optional
            .ofNullable(params.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        // load the SQL script from the classpath and resolve its ${placeholders}
        URL url = Resources.getResource(params.get("sql"));
        String rawSql = Resources.toString(url, StandardCharsets.UTF_8);
        String sql = StringSubstitutor.replace(rawSql, params);
        log.info("sql: {}", sql);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", params.get("hiveMetastoreUris"));

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                // statements are delimited by a ";" followed by an /*EOS*/ marker
                for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
                    log.info("executing: {}", statement);
                    long startTime = System.currentTimeMillis();
                    spark.sql(statement).show();
                    log
                        .info(
                            "executed in {}",
                            DurationFormatUtils.formatDuration(System.currentTimeMillis() - startTime, "HH:mm:ss.S"));
                }
            });
    }

}
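
A minimal usage sketch (illustrative only, not part of the committed sources): every "--name value" pair passed to the job becomes a substitution variable for the matching ${name} placeholder in the SQL script, so any workflow can reuse the class by pointing --sql at a classpath resource. The metastore URI and database names below are placeholders, and Spark with Hive support is assumed to be available on the classpath.

package eu.dnetlib.dhp.oozie;

public class RunSQLSparkJobExample {

    public static void main(String[] args) throws Exception {
        // hypothetical direct invocation, e.g. from a test driver; all values are illustrative
        RunSQLSparkJob
            .main(
                new String[] {
                    "--hiveMetastoreUris", "thrift://example-metastore:9083",
                    "--sql", "eu/dnetlib/dhp/oa/dedup/pivothistory/oozie_app/sql.sql",
                    "--pivot_history_db", "pivots_test_db",
                    "--new_graph_db", "graph_test_db",
                    "--new_graph_date", "20240115"
                });
    }
}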


@@ -0,0 +1,20 @@
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "hmu",
    "paramLongName": "hiveMetastoreUris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": true
  },
  {
    "paramName": "sql",
    "paramLongName": "sql",
    "paramDescription": "sql script to execute",
    "paramRequired": true
  }
]


@@ -0,0 +1,26 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>sparkSqlWarehouseDir</name>
        <value>/user/hive/warehouse</value>
    </property>
</configuration>


@@ -0,0 +1,62 @@
CREATE TABLE `${pivot_history_db}`.`dataset_new` STORED AS PARQUET AS
WITH pivots AS (
    SELECT property.value AS id, '${new_graph_date}' AS usedIn
    FROM `${new_graph_db}`.`relation`
        LEFT SEMI JOIN `${new_graph_db}`.`dataset` ON relation.source = dataset.id
        LATERAL VIEW EXPLODE(properties) AS property
    WHERE relClass = 'isMergedIn' AND property.key = 'pivot'
    UNION
    SELECT id, usedIn FROM `${pivot_history_db}`.`dataset` LATERAL VIEW EXPLODE(usages) AS usedIn
)
SELECT id, min(usedIn) AS firstUsage, max(usedIn) AS lastUsage, collect_set(usedIn) AS usages
FROM pivots
GROUP BY id; /*EOS*/

CREATE TABLE `${pivot_history_db}`.`publication_new` STORED AS PARQUET AS
WITH pivots AS (
    SELECT property.value AS id, '${new_graph_date}' AS usedIn
    FROM `${new_graph_db}`.`relation`
        LEFT SEMI JOIN `${new_graph_db}`.`publication` ON relation.source = publication.id
        LATERAL VIEW EXPLODE(properties) AS property
    WHERE relClass = 'isMergedIn' AND property.key = 'pivot'
    UNION
    SELECT id, usedIn FROM `${pivot_history_db}`.`publication` LATERAL VIEW EXPLODE(usages) AS usedIn
)
SELECT id, min(usedIn) AS firstUsage, max(usedIn) AS lastUsage, collect_set(usedIn) AS usages
FROM pivots
GROUP BY id; /*EOS*/

CREATE TABLE `${pivot_history_db}`.`software_new` STORED AS PARQUET AS
WITH pivots AS (
    SELECT property.value AS id, '${new_graph_date}' AS usedIn
    FROM `${new_graph_db}`.`relation`
        LEFT SEMI JOIN `${new_graph_db}`.`software` ON relation.source = software.id
        LATERAL VIEW EXPLODE(properties) AS property
    WHERE relClass = 'isMergedIn' AND property.key = 'pivot'
    UNION
    SELECT id, usedIn FROM `${pivot_history_db}`.`software` LATERAL VIEW EXPLODE(usages) AS usedIn
)
SELECT id, min(usedIn) AS firstUsage, max(usedIn) AS lastUsage, collect_set(usedIn) AS usages
FROM pivots
GROUP BY id; /*EOS*/

CREATE TABLE `${pivot_history_db}`.`otherresearchproduct_new` STORED AS PARQUET AS
WITH pivots AS (
    SELECT property.value AS id, '${new_graph_date}' AS usedIn
    FROM `${new_graph_db}`.`relation`
        LEFT SEMI JOIN `${new_graph_db}`.`otherresearchproduct` ON relation.source = otherresearchproduct.id
        LATERAL VIEW EXPLODE(properties) AS property
    WHERE relClass = 'isMergedIn' AND property.key = 'pivot'
    UNION
    SELECT id, usedIn FROM `${pivot_history_db}`.`otherresearchproduct` LATERAL VIEW EXPLODE(usages) AS usedIn
)
SELECT id, min(usedIn) AS firstUsage, max(usedIn) AS lastUsage, collect_set(usedIn) AS usages
FROM pivots
GROUP BY id; /*EOS*/

DROP TABLE IF EXISTS `${pivot_history_db}`.`dataset_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`dataset` RENAME TO `${pivot_history_db}`.`dataset_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`dataset_new` RENAME TO `${pivot_history_db}`.`dataset`; /*EOS*/

DROP TABLE IF EXISTS `${pivot_history_db}`.`publication_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`publication` RENAME TO `${pivot_history_db}`.`publication_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`publication_new` RENAME TO `${pivot_history_db}`.`publication`; /*EOS*/

DROP TABLE IF EXISTS `${pivot_history_db}`.`software_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`software` RENAME TO `${pivot_history_db}`.`software_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`software_new` RENAME TO `${pivot_history_db}`.`software`; /*EOS*/

DROP TABLE IF EXISTS `${pivot_history_db}`.`otherresearchproduct_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`otherresearchproduct` RENAME TO `${pivot_history_db}`.`otherresearchproduct_old`; /*EOS*/
ALTER TABLE `${pivot_history_db}`.`otherresearchproduct_new` RENAME TO `${pivot_history_db}`.`otherresearchproduct`; /*EOS*/


@@ -0,0 +1,95 @@
<workflow-app name="Update pivot history" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <!-- properties used in SQL -->
        <property>
            <name>pivot_history_db</name>
            <!-- <value>openaire_beta_pivots_test</value> -->
            <description>Pivot history DB on hive</description>
        </property>
        <property>
            <name>new_graph_db</name>
            <!-- <value>openaire_beta_20231208</value> -->
            <description>New graph DB on hive</description>
        </property>
        <property>
            <name>new_graph_date</name>
            <!-- <value>20231208</value> -->
            <description>Creation date of new graph db</description>
        </property>

        <!-- RunSQLSparkJob properties -->
        <property>
            <name>hiveMetastoreUris</name>
            <description>hive server metastore URIs</description>
        </property>
        <property>
            <name>sparkSqlWarehouseDir</name>
        </property>

        <!-- General oozie workflow properties -->
        <property>
            <name>sparkClusterOpts</name>
            <value>--conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
            <description>spark cluster-wide options</description>
        </property>
        <property>
            <name>sparkResourceOpts</name>
            <value>--executor-memory=3G --conf spark.executor.memoryOverhead=3G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
            <description>spark resource options</description>
        </property>
        <property>
            <name>sparkApplicationOpts</name>
            <value>--conf spark.sql.shuffle.partitions=3840</value>
            <description>spark application options</description>
        </property>
    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <start to="UpgradePivotHistory"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="UpgradePivotHistory">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Upgrade Pivot History</name>
            <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                ${sparkClusterOpts}
                ${sparkResourceOpts}
                ${sparkApplicationOpts}
            </spark-opts>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--sql</arg><arg>eu/dnetlib/dhp/oa/dedup/pivothistory/oozie_app/sql.sql</arg>
            <arg>--pivot_history_db</arg><arg>${pivot_history_db}</arg>
            <arg>--new_graph_db</arg><arg>${new_graph_db}</arg>
            <arg>--new_graph_date</arg><arg>${new_graph_date}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>