diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
index 7b4bb96cf..08f9b1eac 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
@@ -1,18 +1,16 @@
 # The following set of properties are defined in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters
 # and concern the parameterization required for running workflows on the @GARR cluster
-dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
-dhp.hadoop.frontend.user.name=ilias.kanellos
-dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
-dhp.hadoop.frontend.port.ssh=22
-oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
-jobTracker=yarnRM
-nameNode=hdfs://nameservice1
-oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
-maven.executable=mvn
-sparkDriverMemory=7G
-sparkExecutorMemory=7G
-sparkExecutorCores=4
+# --- The following properties come from your ~/.dhp/application.properties; uncomment them here only if you need to override them ---
+# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
+# dhp.hadoop.frontend.user.name=ilias.kanellos
+# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
+# dhp.hadoop.frontend.port.ssh=22
+# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
+# jobTracker=yarnRM
+# nameNode=hdfs://nameservice1
+# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
+# maven.executable=mvn
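+#
+# Example: to run the workflow from another account, override the frontend
+# settings in ~/.dhp/application.properties (placeholder values shown):
+#   dhp.hadoop.frontend.user.name=some.user
+#   dhp.hadoop.frontend.temp.dir=/home/some.user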
 
 # Some memory and driver settings for more demanding tasks
 sparkHighDriverMemory=20G
@@ -21,6 +19,9 @@ sparkNormalDriverMemory=10G
 sparkHighExecutorMemory=20G
 sparkNormalExecutorMemory=10G
 
+sparkExecutorCores=4
+sparkShufflePartitions=7680
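+# spark.sql.shuffle.partitions (sparkShufflePartitions) sets the number of
+# reduce-side tasks in Spark SQL shuffles; a common heuristic is a small
+# multiple of the total executor cores, so tune 7680 to the target cluster.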
+
 # The above is given differently in an example I found online
 oozie.action.sharelib.for.spark=spark2
 oozieActionShareLibForSpark2=spark2
@@ -66,29 +67,26 @@ ramGamma=0.6
 convergenceError=0.000000000001
 
 # I think this should be the oozie workflow directory
-oozieWorkflowPath=user/ilias.kanellos/workflow_example/
-
-# The directory where the workflow data is/should be stored
-workflowDataDir=user/ilias.kanellos/ranking_workflow
+# oozieWorkflowPath=user/ilias.kanellos/workflow_example/
 
 # Directory where json data containing scores will be output
-bipScorePath=${workflowDataDir}/openaire_universe_scores/
+bipScorePath=${workingDir}/openaire_universe_scores/
 
 # Directory where dataframes are checkpointed
-checkpointDir=${nameNode}/${workflowDataDir}/check/
+checkpointDir=${nameNode}/${workingDir}/check/
 
 # The directory for the doi-based bip graph
-bipGraphFilePath=${nameNode}/${workflowDataDir}/bipdbv8_graph
+bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph
 
 # The folder from which synonyms of openaire-ids are read
 # openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/
-openaireDataInput=${/tmp/prod_provision/graph/18_graph_blacklisted}
+openaireDataInput=/tmp/prod_provision/graph/18_graph_blacklisted
 
 # A folder where we will write the openaire to doi mapping
-synonymFolder=${nameNode}/${workflowDataDir}/openaireid_to_dois/
+synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/
 
 # This will be where we store the openaire graph input. They told us on GARR to use a directory under /data
-openaireGraphInputPath=${nameNode}/${workflowDataDir}/openaire_id_graph
+openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph
 
 # The workflow application path
 wfAppPath=${nameNode}/${oozieWorkflowPath}
@@ -96,8 +94,8 @@ wfAppPath=${nameNode}/${oozieWorkflowPath}
 oozie.wf.application.path=${wfAppPath}
 
 # Path where the final output should be?
-actionSetOutputPath=${workflowDataDir}/bip_actionsets/
+actionSetOutputPath=${workingDir}/bip_actionsets/
 
 # The directory to store project impact indicators
-projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators
+projectImpactIndicatorsOutput=${workingDir}/project_indicators
diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
index f07a27244..d930ab774 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@@ -46,21 +46,23 @@
     <jar>create_openaire_ranking_graph.py</jar>
-    <spark-opts>--executor-memory 20G --executor-cores 4 --driver-memory 20G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkHighDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
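+    <!-- Spark resources for every action are now read from job.properties
+         (sparkHighExecutorMemory, sparkNormalDriverMemory, sparkExecutorCores,
+         sparkShufflePartitions) instead of being hardcoded per action; master
+         and deploy mode are presumably supplied by the action definition itself. -->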
     <arg>${openaireDataInput}</arg>
     <arg>${currentYear}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>${openaireGraphInputPath}</arg>
@@ -100,18 +102,20 @@
     <jar>CC.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${openaireGraphInputPath}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <file>${wfAppPath}/CC.py#CC.py</file>
@@ -141,21 +145,23 @@
     <jar>TAR.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${openaireGraphInputPath}</arg>
     <arg>${ramGamma}</arg>
     <arg>${currentYear}</arg>
     <arg>RAM</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>${checkpointDir}</arg>
     <file>${wfAppPath}/TAR.py#TAR.py</file>
@@ -189,18 +195,20 @@
     <jar>CC.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${openaireGraphInputPath}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>3</arg>
     <file>${wfAppPath}/CC.py#CC.py</file>
@@ -244,21 +252,23 @@
     <jar>PageRank.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${openaireGraphInputPath}</arg>
     <arg>${pageRankAlpha}</arg>
     <arg>${convergenceError}</arg>
     <arg>${checkpointDir}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>dfs</arg>
     <file>${wfAppPath}/PageRank.py#PageRank.py</file>
@@ -289,14 +299,16 @@
     <jar>AttRank.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${openaireGraphInputPath}</arg>
     <arg>${attrankAlpha}</arg>
@@ -308,7 +320,7 @@
     <arg>${convergenceError}</arg>
     <arg>${checkpointDir}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>dfs</arg>
     <file>${wfAppPath}/AttRank.py#AttRank.py</file>
@@ -339,7 +351,7 @@
     <exec>get_ranking_files.sh</exec>
-    <argument>/${workflowDataDir}</argument>
+    <argument>/${workingDir}</argument>
     <file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
@@ -381,24 +393,26 @@
     <jar>format_ranking_results.py</jar>
-    <spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkNormalExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
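+    <!-- The ranking score files are discovered at runtime: get_ranking_files.sh
+         presumably emits pr_file, attrank_file, cc_file, impulse_file and
+         ram_file as key=value pairs through the shell action's capture-output,
+         which the wf:actionData('get-file-names') expressions below then read. -->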
     <arg>json-5-way</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>openaire</arg>
@@ -429,24 +443,26 @@
     <jar>format_ranking_results.py</jar>
-    <spark-opts>--executor-memory 10G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkNormalExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>zenodo</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>openaire</arg>
@@ -484,14 +500,16 @@
     <jar>map_openaire_ids_to_dois.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkHighDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${openaireDataInput}</arg>
@@ -526,24 +544,26 @@
     <jar>map_scores_to_dois.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 15G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
-    --conf spark.extraListeners=${spark2ExtraListeners}
-    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkHighDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
+    --conf spark.extraListeners=${spark2ExtraListeners}
+    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
     <arg>${synonymFolder}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
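+    <!-- Same five score files as in the formatting steps above; this action
+         presumably joins them against the openaire-id to doi mapping written
+         in ${synonymFolder}. -->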
     <file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
@@ -576,9 +596,9 @@
     <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
     <jar>dhp-aggregation-${projectVersion}.jar</jar>
     <spark-opts>
-    --executor-memory=${sparkExecutorMemory}
+    --executor-memory=${sparkNormalExecutorMemory}
     --executor-cores=${sparkExecutorCores}
-    --driver-memory=${sparkDriverMemory}
+    --driver-memory=${sparkNormalDriverMemory}
     --conf spark.extraListeners=${spark2ExtraListeners}
     --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
     --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -609,14 +629,16 @@
     <jar>projects_impact.py</jar>
-    <spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
-    --master yarn
-    --deploy-mode cluster
-    --conf spark.sql.shuffle.partitions=7680
+    <spark-opts>
+    --executor-memory=${sparkHighExecutorMemory}
+    --executor-cores=${sparkExecutorCores}
+    --driver-memory=${sparkNormalDriverMemory}
+    --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
     --conf spark.extraListeners=${spark2ExtraListeners}
     --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
     --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+    </spark-opts>
@@ -624,13 +646,13 @@
     <arg>${openaireDataInput}/relations</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
-    <arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+    <arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
-    <arg>7680</arg>
+    <arg>${sparkShufflePartitions}</arg>
     <arg>${projectImpactIndicatorsOutput}</arg>
@@ -654,9 +676,9 @@
     <class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
     <jar>dhp-aggregation-${projectVersion}.jar</jar>
     <spark-opts>
-    --executor-memory=${sparkExecutorMemory}
+    --executor-memory=${sparkNormalExecutorMemory}
     --executor-cores=${sparkExecutorCores}
-    --driver-memory=${sparkDriverMemory}
+    --driver-memory=${sparkNormalDriverMemory}
     --conf spark.extraListeners=${spark2ExtraListeners}
     --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
     --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}