Refactor hive job to populate tables using SQL

2024-12-05 15:03:49 +01:00 · 2024-12-05 15:03:49 +01:00 · f10febb565
parent dd6ed31383
commit f10febb565
7 changed files with 100 additions and 265 deletions
--- a/dhp-shade-package/dependency-reduced-pom.xml
+++ b/dhp-shade-package/dependency-reduced-pom.xml
@ -95,6 +95,10 @@
          <artifactId>byte-buddy-agent</artifactId>
          <groupId>net.bytebuddy</groupId>
        </exclusion>
        <exclusion>
          <artifactId>objenesis</artifactId>
          <groupId>org.objenesis</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
@ -102,6 +106,12 @@
      <artifactId>mockito-junit-jupiter</artifactId>
      <version>3.3.3</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>junit-jupiter-api</artifactId>
          <groupId>org.junit.jupiter</groupId>
        </exclusion>
      </exclusions>
    </dependency>
  </dependencies>
  <distributionManagement>
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -88,6 +88,12 @@
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>log4j</artifactId>
                    <groupId>log4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
@ -23,4 +23,8 @@
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
    <property>
        <name>sparkSqlWarehouseDir</name>
        <value>/user/hive/warehouse</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
@ -0,0 +1,49 @@
 CREATE TEMPORARY VIEW datasource USING json OPTIONS ( path "${inputPath}/datasource"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.datasource
    USING parquet
    CLUSTERED BY ( id ) INTO 200 BUCKETS
    AS SELECT * FROM datasource DISTRIBUTE BY id;  /*EOS*/
 CREATE TEMPORARY VIEW dataset USING json OPTIONS ( path "${inputPath}/dataset"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.dataset
    USING parquet
    CLUSTERED BY ( id ) INTO 4000 BUCKETS
    AS SELECT * FROM dataset DISTRIBUTE BY id;  /*EOS*/
 CREATE TEMPORARY VIEW organization USING json OPTIONS ( path "${inputPath}/organization"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.organization
    USING parquet
    CLUSTERED BY ( id ) INTO 1000 BUCKETS
    AS SELECT * FROM organization DISTRIBUTE BY id;  /*EOS*/
 CREATE TEMPORARY VIEW otherresearchproduct USING json OPTIONS ( path "${inputPath}/otherresearchproduct"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.otherresearchproduct
    USING parquet
    CLUSTERED BY ( id ) INTO 8000 BUCKETS
    AS SELECT * FROM otherresearchproduct DISTRIBUTE BY id;  /*EOS*/
 CREATE TEMPORARY VIEW project USING json OPTIONS ( path "${inputPath}/project"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.project
    USING parquet
    CLUSTERED BY ( id ) INTO 1000 BUCKETS
    AS SELECT * FROM project DISTRIBUTE BY id;  /*EOS*/
 CREATE TEMPORARY VIEW publication USING json OPTIONS ( path "${inputPath}/publication"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.publication
    USING parquet
    CLUSTERED BY ( id ) INTO 10000 BUCKETS
    AS SELECT * FROM publication DISTRIBUTE BY id;  /*EOS*/
 CREATE TEMPORARY VIEW relation USING json OPTIONS ( path "${inputPath}/relation"); /*EOS*/
 DROP TABLE IF EXISTS ${hiveDbName}.relation; /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.relation
    USING parquet
    PARTITIONED BY ( relClass )
    AS SELECT * FROM relation DISTRIBUTE BY source,target; /*EOS*/
 CREATE TEMPORARY VIEW software USING json OPTIONS ( path "${inputPath}/software"); /*EOS*/
 CREATE TABLE IF NOT EXISTS ${hiveDbName}.software
    USING parquet
    CLUSTERED BY ( id ) INTO 1000 BUCKETS
    AS SELECT * FROM software DISTRIBUTE BY source;  /*EOS*/
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@ -51,6 +51,22 @@
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
        <!-- General oozie workflow properties -->
        <property>
            <name>sparkClusterOpts</name>
            <value>--conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
            <description>spark cluster-wide options</description>
        </property>
        <property>
            <name>sparkResourceOpts</name>
            <value>--executor-memory=3G --conf spark.executor.memoryOverhead=3G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
            <description>spark resource options</description>
        </property>
        <property>
            <name>sparkApplicationOpts</name>
            <value>--conf spark.sql.shuffle.partitions=3840</value>
            <description>spark resource options</description>
        </property>
    </parameters>
    <global>
@ -90,285 +106,32 @@
            <script>lib/scripts/reset_db.sql</script>
            <param>hiveDbName=${hiveDbName}</param>
        </hive2>
-        <ok to="fork_import"/>
+        <ok to="import_graph"/>
        <error to="Kill"/>
    </action>
-    <fork name="fork_import">
+    <action name="import_graph">
        <path start="import_publication"/>
        <path start="import_dataset"/>
        <path start="import_orp"/>
        <path start="import_software"/>
        <path start="import_datasource"/>
        <path start="import_organization"/>
        <path start="import_project"/>
        <path start="import_person"/>
        <path start="import_relation"/>
    </fork>
    <action name="import_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-            <name>Import table publication</name>
+            <name>Import graph tables into Hive</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
-                --conf spark.sql.shuffle.partitions=10000
+                ${sparkClusterOpts}
                ${sparkResourceOpts}
                ${sparkApplicationOpts}
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
-            <arg>--numPartitions</arg><arg>10000</arg>
+            <arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--inputPath</arg><arg>${inputPath}</arg>
        </spark>
-        <ok to="join_import"/>
+        <ok to="PostProcessing"/>
        <error to="Kill"/>
    </action>
    <action name="import_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table dataset</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=4000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>8000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table otherresearchproduct</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=8000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>3000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table software</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_datasource">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table datasource</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=200
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>200</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_organization">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table organization</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/organization</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_project">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table project</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/project</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_person">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table person</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=1000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/person</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>1000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <action name="import_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Import table relation</name>
            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/relation</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--numPartitions</arg><arg>15000</arg>
        </spark>
        <ok to="join_import"/>
        <error to="Kill"/>
    </action>
    <join name="join_import" to="PostProcessing"/>
    <action name="PostProcessing">
        <hive2 xmlns="uri:oozie:hive2-action:0.1">
            <configuration>
--- a/pom.xml
+++ b/pom.xml
@ -988,6 +988,9 @@
                </dependencies>
            </dependencyManagement>
        </profile>
        <profile>
            <id>spark-24</id>
        </profile>
        <profile>
            <id>spark-34</id>
            <properties>