[graph provision] added person to the graph2hive workflow

2024-08-05 09:35:07 +02:00 · 2024-08-05 09:35:07 +02:00 · 0bf76f2a34
parent 975d44cac7
commit 0bf76f2a34
1 changed files with 30 additions and 0 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@@ -102,6 +102,7 @@
        <path start="import_datasource"/>
        <path start="import_organization"/>
        <path start="import_project"/>
+        <path start="import_person"/>
        <path start="import_relation"/>
    </fork>

@@ -308,6 +309,35 @@
        <error to="Kill"/>
    </action>

+    <action name="import_person"> <!-- Spark action that runs GraphHiveTableImporterJob for eu.dnetlib.dhp.schema.oaf.Person, reading ${inputPath}/person. -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Import table person</name>
+            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar> <!-- NOTE(review): the options below set spark.executor.memoryOverhead to ${sparkExecutorMemory}, the same value as the executor heap; confirm this is intended. -->
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+                --conf spark.sql.shuffle.partitions=1000
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${inputPath}/person</arg> <!-- Input location: the person sub-path under the workflow inputPath property. -->
+            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
+            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
+            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--numPartitions</arg><arg>1000</arg> <!-- Hard-coded to 1000, the same literal used for spark.sql.shuffle.partitions above; not exposed as a workflow property. -->
+        </spark>
+        <ok to="join_import"/> <!-- On success, transition to the join_import node. -->
+        <error to="Kill"/> <!-- On failure, transition to the Kill node. -->
+    </action>
+
    <action name="import_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>