From 62ff4999e336b1524aaccfa80757baabb0deb634 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 1 Dec 2020 14:30:56 +0100 Subject: [PATCH] added workflow and last step of collection and save --- .../bipfinder/input_actionset_parameter.json | 14 +- .../bipfinder/oozie_app/workflow.xml | 178 +++++++++--------- 2 files changed, 94 insertions(+), 98 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json index ae844a0c9c..67911eef1f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -16,17 +16,5 @@ "paramLongName": "outputPath", "paramDescription": "the path of the new ActionSet", "paramRequired": true -}, - { - "paramName": "rtn", - "paramLongName": "resultTableName", - "paramDescription": "the path of the new ActionSet", - "paramRequired": true - }, - { - "paramName": "bsp", - "paramLongName": "bipScorePath", - "paramDescription": "the path of the new ActionSet", - "paramRequired": true - } +} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml index c710c8b553..6f74146f18 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml @@ -1,22 +1,17 @@ - + - projectFileURL - the url where to get the projects file + inputPath + the input path of the resources to be extended - programmeFileURL - the url where to get the programme file - - - - topicFileURL - the url where to get the topic file + bipScorePath + the path where to find the bipFinder scores outputPath - path where to store the action set + the path where to store the actionset @@ -31,128 +26,141 @@ - + - - - eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV - --hdfsNameNode${nameNode} - --fileURL${projectFileURL} - --hdfsPath${workingDir}/projects - --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProject - - - - + + + + + + - - - eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV - --hdfsNameNode${nameNode} - --fileURL${programmeFileURL} - --hdfsPath${workingDir}/programme - --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme - - - - - - - - eu.dnetlib.dhp.actionmanager.project.utils.ReadExcel - --hdfsNameNode${nameNode} - --fileURL${topicFileURL} - --hdfsPath${workingDir}/topic - --classForNameeu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic - - - - - - - - eu.dnetlib.dhp.actionmanager.project.ReadProjectsFromDB - --hdfsPath${workingDir}/dbProjects - --hdfsNameNode${nameNode} - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} - - - - - - + yarn cluster - PrepareProgramme - eu.dnetlib.dhp.actionmanager.project.PrepareProgramme + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --programmePath${workingDir}/programme - --outputPath${workingDir}/preparedProgramme + --inputPath${inputPath}/publication + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/publication + --bipScorePath${bipScorePath} - + - + yarn cluster - PrepareProjects - eu.dnetlib.dhp.actionmanager.project.PrepareProjects + Produces the atomic action with the bip finder scores for datasets + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --projectPath${workingDir}/projects - --outputPath${workingDir}/preparedProjects - --dbProjectPath${workingDir}/dbProjects + --inputPath${inputPath}/dataset + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/otherresearchproduct + --bipScorePath${bipScorePath} - + - + yarn cluster - ProjectProgrammeAS - eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob + Produces the atomic action with the bip finder scores for orp + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --projectPath${workingDir}/preparedProjects - --programmePath${workingDir}/preparedProgramme - --topicPath${workingDir}/topic + --inputPath${inputPath}/otherresearchproduct + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/otherresearchproduct + --bipScorePath${bipScorePath} + + + + + + + + yarn + cluster + Produces the atomic action with the bip finder scores for software + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/software + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/software + --bipScorePath${bipScorePath} + + + + + + + + + + yarn + cluster + saves all the aa produced for the several types of results in the as output path + eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${workingDir} --outputPath${outputPath}