Refactor hive job to populate tables using SQL

2024-12-05 15:03:49 +01:00 · 2024-12-05 15:03:49 +01:00 · f10febb565
parent dd6ed31383
commit f10febb565
7 changed files with 100 additions and 265 deletions
--- a/dhp-shade-package/dependency-reduced-pom.xml
+++ b/dhp-shade-package/dependency-reduced-pom.xml
@ -95,6 +95,10 @@
          <artifactId>byte-buddy-agent</artifactId>
          <groupId>net.bytebuddy</groupId>
        </exclusion>
+        <exclusion>
+          <artifactId>objenesis</artifactId>
+          <groupId>org.objenesis</groupId>
+        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
@ -102,6 +106,12 @@
      <artifactId>mockito-junit-jupiter</artifactId>
      <version>3.3.3</version>
      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>junit-jupiter-api</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+      </exclusions>
    </dependency>
  </dependencies>
  <distributionManagement>
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
@ -160,7 +160,7 @@ public class ORCIDExtractor extends Thread {
 			}
 		} finally {
 			for (SequenceFile.Writer k : fileMap.values()) {
-					log.info("Thread {}: Completed processed {} items", id, extractedItem);
+				log.info("Thread {}: Completed processed {} items", id, extractedItem);
 				k.hflush();
 				k.close();
 			}
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -88,6 +88,12 @@
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
+            <exclusions>
+                <exclusion>
+                    <artifactId>log4j</artifactId>
+                    <groupId>log4j</groupId>
+                </exclusion>
+            </exclusions>
        </dependency>

        <dependency>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
@ -23,4 +23,8 @@
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
+    <property>
+        <name>sparkSqlWarehouseDir</name>
+        <value>/user/hive/warehouse</value>
+    </property>
 </configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
@ -0,0 +1,49 @@
+
+CREATE TEMPORARY VIEW datasource USING json OPTIONS ( path "${inputPath}/datasource"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.datasource
+    USING parquet
+    CLUSTERED BY ( id ) INTO 200 BUCKETS
+    AS SELECT * FROM datasource DISTRIBUTE BY id;  /*EOS*/
+
+CREATE TEMPORARY VIEW dataset USING json OPTIONS ( path "${inputPath}/dataset"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.dataset
+    USING parquet
+    CLUSTERED BY ( id ) INTO 4000 BUCKETS
+    AS SELECT * FROM dataset DISTRIBUTE BY id;  /*EOS*/
+
+CREATE TEMPORARY VIEW organization USING json OPTIONS ( path "${inputPath}/organization"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.organization
+    USING parquet
+    CLUSTERED BY ( id ) INTO 1000 BUCKETS
+    AS SELECT * FROM organization DISTRIBUTE BY id;  /*EOS*/
+
+CREATE TEMPORARY VIEW otherresearchproduct USING json OPTIONS ( path "${inputPath}/otherresearchproduct"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.otherresearchproduct
+    USING parquet
+    CLUSTERED BY ( id ) INTO 8000 BUCKETS
+    AS SELECT * FROM otherresearchproduct DISTRIBUTE BY id;  /*EOS*/
+
+CREATE TEMPORARY VIEW project USING json OPTIONS ( path "${inputPath}/project"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.project
+    USING parquet
+    CLUSTERED BY ( id ) INTO 1000 BUCKETS
+    AS SELECT * FROM project DISTRIBUTE BY id;  /*EOS*/
+
+CREATE TEMPORARY VIEW publication USING json OPTIONS ( path "${inputPath}/publication"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.publication
+    USING parquet
+    CLUSTERED BY ( id ) INTO 10000 BUCKETS
+    AS SELECT * FROM publication DISTRIBUTE BY id;  /*EOS*/
+
+CREATE TEMPORARY VIEW relation USING json OPTIONS ( path "${inputPath}/relation"); /*EOS*/
+DROP TABLE IF EXISTS ${hiveDbName}.relation; /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.relation
+    USING parquet
+    PARTITIONED BY ( relClass )
+    AS SELECT * FROM relation DISTRIBUTE BY source,target; /*EOS*/
+
+CREATE TEMPORARY VIEW software USING json OPTIONS ( path "${inputPath}/software"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.software
+    USING parquet
+    CLUSTERED BY ( id ) INTO 1000 BUCKETS
+    AS SELECT * FROM software DISTRIBUTE BY id;  /*EOS*/
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -51,6 +51,22 @@
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
+        <!-- General oozie workflow properties -->
+        <property>
+            <name>sparkClusterOpts</name>
+            <value>--conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
+            <description>spark cluster-wide options</description>
+        </property>
+        <property>
+            <name>sparkResourceOpts</name>
+            <value>--executor-memory=3G --conf spark.executor.memoryOverhead=3G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
+            <description>spark resource options</description>
+        </property>
+        <property>
+            <name>sparkApplicationOpts</name>
+            <value>--conf spark.sql.shuffle.partitions=3840</value>
+            <description>spark application options</description>
+        </property>
    </parameters>

    <global>
@ -90,285 +106,32 @@
            <script>lib/scripts/reset_db.sql</script>
            <param>hiveDbName=${hiveDbName}</param>
        </hive2>
-        <ok to="fork_import"/>
+        <ok to="import_graph"/>
        <error to="Kill"/>
    </action>

-    <fork name="fork_import">
-        <path start="import_publication"/>
-        <path start="import_dataset"/>
-        <path start="import_orp"/>
-        <path start="import_software"/>
-        <path start="import_datasource"/>
-        <path start="import_organization"/>
-        <path start="import_project"/>
-        <path start="import_person"/>
-        <path start="import_relation"/>
-    </fork>
-
-    <action name="import_publication">
+    <action name="import_graph">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>Import table publication</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Import graph tables into Hive</name>
+            <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=10000
+                ${sparkClusterOpts}
+                ${sparkResourceOpts}
+                ${sparkApplicationOpts}
            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>10000</arg>
+            <arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql</arg>
+            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
+            <arg>--inputPath</arg><arg>${inputPath}</arg>
        </spark>
-        <ok to="join_import"/>
+        <ok to="PostProcessing"/>
        <error to="Kill"/>
    </action>

-    <action name="import_dataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table dataset</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=4000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>8000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_orp">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table otherresearchproduct</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=8000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>3000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_software">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table software</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=1000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>1000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_datasource">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table datasource</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=200
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>200</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_organization">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table organization</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=1000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/organization</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>1000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_project">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table project</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=1000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/project</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>1000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_person">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table person</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=1000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/person</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>1000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="import_relation">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Import table relation</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=15000
-            </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/relation</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>15000</arg>
-        </spark>
-        <ok to="join_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <join name="join_import" to="PostProcessing"/>
-
    <action name="PostProcessing">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <configuration>
--- a/pom.xml
+++ b/pom.xml
@ -988,6 +988,9 @@
                </dependencies>
            </dependencyManagement>
        </profile>
+        <profile>
+            <id>spark-24</id>
+        </profile>
        <profile>
            <id>spark-34</id>
            <properties>