From 3f4b579e7f4ef00906a31da85c596be41c8c037e Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 14 Apr 2020 16:49:24 +0200 Subject: [PATCH] new workflow. It is composed of four steps. The first removes the directory where to store the results. The second copies the relation to the new locatio, the third id the preparation phase and then the actual propagation --- .../projecttoresult/oozie_app/workflow.xml | 108 ++++++++++++++++-- 1 file changed, 100 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml index 4c073f0a2..1d15391ab 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml @@ -20,14 +20,96 @@ sparkExecutorCores number of cores used by single executor + + writeUpdate + writes the information found for the update. No double check done if the information is already present + + + saveGraph + writes new version of the graph after the propagation step + - - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${workingDir}/projecttoresult_propagation/relation + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/relation + --allowedsemrels${allowedsemrels} + --hive_metastore_uris${hive_metastore_uris} + --potentialUpdatePath${workingDir}/projecttoresult_propagation/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/projecttoresult_propagation/preparedInfo/alreadyLinked + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob3 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath}/relation + --writeUpdate${writeUpdate} + --saveGraph${saveGraph} + --hive_metastore_uris${hive_metastore_uris} + --outputPath${workingDir}/projecttoresult_propagation/relation + --potentialUpdatePath${workingDir}/projecttoresult_propagation/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/projecttoresult_propagation/preparedInfo/alreadyLinked + + + + + ${jobTracker} @@ -35,17 +117,27 @@ yarn-cluster cluster ProjectToResultPropagation - eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob2 dhp-propagation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - -mt yarn-cluster --sourcePath${sourcePath} --allowedsemrels${allowedsemrels} + --hive_metastore_uris${hive_metastore_uris} + --writeUpdate${writeUpdate} + --saveGraph${saveGraph} + --potentialUpdatePath${workingDir}/projecttoresult_propagation/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/projecttoresult_propagation/preparedInfo/alreadyLinked