forked from D-Net/dnet-hadoop
Changes required to run cleaning workflow with spark 3.4
This commit is contained in:
parent
759e060451
commit
ffc7488257
|
@ -128,6 +128,12 @@
|
|||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>log4j</artifactId>
|
||||
<groupId>log4j</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -11,6 +11,10 @@
|
|||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
|
|
|
@ -81,6 +81,22 @@
|
|||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
<!-- General oozie workflow properties -->
|
||||
<property>
|
||||
<name>sparkClusterOpts</name>
|
||||
<value>--conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
|
||||
<description>spark cluster-wide options</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkResourceOpts</name>
|
||||
<value>--executor-memory=8G --conf spark.executor.memoryOverhead=6G --executor-cores=6 --driver-memory=9G --driver-cores=4</value>
|
||||
<description>spark resource options</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkApplicationOpts</name>
|
||||
<value>--conf spark.sql.shuffle.partitions=1000</value>
|
||||
<description>spark resource options</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="prepare_info"/>
|
||||
|
@ -102,13 +118,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.GetDatasourceFromCountry</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}</arg>
|
||||
|
@ -154,15 +166,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
||||
|
@ -190,15 +196,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
||||
|
@ -226,15 +226,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
||||
|
@ -262,15 +256,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
||||
|
@ -298,15 +286,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=1000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
||||
|
@ -334,15 +316,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=1000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
||||
|
@ -370,15 +346,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
||||
|
@ -406,15 +376,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/person</arg>
|
||||
|
@ -442,15 +406,9 @@
|
|||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.autoBroadcastJoinThreshold=-1
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
||||
|
|
Loading…
Reference in New Issue