From f9ee343fc081ab8715d39305ac44ba79e2038ec7 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 27 Apr 2020 10:48:31 +0200 Subject: [PATCH] new parametrized workflow with preparation steps and new parameter input files --- .../input_communitytoresult_parameters.json | 6 + ..._preparecommunitytoresult2_parameters.json | 32 +- ...t_preparecommunitytoresult_parameters.json | 29 +- .../oozie_app/config-default.xml | 36 ++ .../oozie_app/workflow.xml | 315 ++++++++++++++++-- 5 files changed, 338 insertions(+), 80 deletions(-) diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json index 27bb097f9d..2baec0e687 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json +++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json @@ -35,6 +35,12 @@ "paramLongName": "preparedInfoPath", "paramDescription": "the path where prepared info have been stored", "paramRequired": true + }, + { + "paramName":"test", + "paramLongName":"isTest", + "paramDescription": "true if it is executing a test", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json index f61db6f346..3ba3c8e9c7 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json +++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json @@ -1,34 +1,10 @@ [ - { - "paramName":"is", - "paramLongName":"isLookupUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, { "paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true }, - { - "paramName":"as", - "paramLongName":"allowedsemrels", - "paramDescription": "the allowed semantic relations for propagation", - "paramRequired": true - }, - { - "paramName":"h", - "paramLongName":"hive_metastore_uris", - "paramDescription": "the hive metastore uris", - "paramRequired": true - }, - { - "paramName":"sg", - "paramLongName":"saveGraph", - "paramDescription": "true if the new version of the graph must be saved", - "paramRequired": false - }, { "paramName": "ssm", "paramLongName": "isSparkSessionManaged", @@ -40,11 +16,5 @@ "paramLongName": "outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true - }, - { - "paramName":"tn", - "paramLongName":"resultTableName", - "paramDescription": "the name of the result table we are currently working on", - "paramRequired": true - } + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json index 90b5974ea4..a5dfefc0a0 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json +++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json @@ -5,12 +5,6 @@ "paramDescription": "URL of the isLookUp Service", "paramRequired": true }, - { - "paramName":"mt", - "paramLongName":"master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, { "paramName":"s", "paramLongName":"sourcePath", @@ -30,16 +24,21 @@ "paramRequired": true }, { - "paramName":"wu", - "paramLongName":"writeUpdate", - "paramDescription": "true if the update must be writte. No double check if information is already present", + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", "paramRequired": true }, { - "paramName":"sg", - "paramLongName":"saveGraph", - "paramDescription": "true if the new version of the graph must be saved", - "paramRequired": true - } - + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml index ea3a4d9223..2744ea92ba 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml @@ -19,4 +19,40 @@ hive_metastore_uris thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml index 4312ec0682..d320bc9eb4 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml @@ -8,54 +8,301 @@ allowedsemrels the semantic relationships allowed for propagation - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - isLookupUrl the isLookup service endpoint - + - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/relation + ${nameNode}/${workingDir}/projecttoresult_propagation/relation + + + + - - - ${jobTracker} - ${nameNode} - yarn-cluster + + + + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-Publications + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookupUrl${isLookupUrl} + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-Dataset + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookupUrl${isLookupUrl} + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-ORP + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookupUrl${isLookupUrl} + + + + + + + yarn + cluster + ResultToCommunitySemRel-PreparePhase1-Software + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --allowedsemrels${allowedsemrels} + --isLookupUrl${isLookupUrl} + + + + + + + + + + yarn cluster - ResultToCommunitySemRelPropagation - eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob + ResultToCommunityEmRelPropagation-PreparePhase2 + eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2 dhp-propagation-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - -mt yarn-cluster - --sourcePath${sourcePath} - --allowedsemrels${allowedsemrels} - --hive_metastore_uris${hive_metastore_uris} - --isLookupUrl${isLookupUrl} + --sourcePath${workingDir}/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/preparedInfo/mergedCommunityAssoc - + + - + + + + + + + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-Publication + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob4 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/publication + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/publication + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-Dataset + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob4 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/dataset + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/dataset + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-ORP + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob4 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/otherresearchproduct + + + + + + + yarn + cluster + Result2CommunitySemRelPropagation-Software + eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob4 + dhp-propagation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${sourcePath}/software + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/software + + + + + + \ No newline at end of file