From a089db18f1a6f18790123800d69b53dec88f3998 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Tue, 9 Jun 2020 15:39:38 +0200
Subject: [PATCH] workflow and parameters to execute the dump

---
 .../dhp/oa/graph/dump/input_parameters.json   |  30 +-
 .../dhp/oa/graph/dump/oozie_app/workflow.xml  | 373 +++++++++++++-----
 .../graph/dump/project_input_parameters.json  |  20 +-
 .../dump/project_prepare_parameters.json      |  28 +-
 .../dhp/oa/graph/dump/split_parameters.json   |  12 -
 5 files changed, 314 insertions(+), 149 deletions(-)

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json
index 82714d973..04f3c9e1b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json
@@ -1,4 +1,11 @@
+
 [
+  {
+    "paramName":"is",
+    "paramLongName":"isLookUpUrl",
+    "paramDescription": "URL of the isLookUp Service",
+    "paramRequired": true
+  },
   {
     "paramName":"s",
     "paramLongName":"sourcePath",
@@ -16,5 +23,26 @@
     "paramLongName": "isSparkSessionManaged",
     "paramDescription": "true if the spark session is managed, false otherwise",
     "paramRequired": false
+  },
+  {
+    "paramName":"tn",
+    "paramLongName":"resultTableName",
+    "paramDescription": "the name of the result table we are currently working on",
+    "paramRequired": true
+  },
+  {
+    "paramName":"dn",
+    "paramLongName":"dumpTableName",
+    "paramDescription": "the name of the correspondent dump element",
+    "paramRequired": true
+  },
+  {
+    "paramName":"rt",
+    "paramLongName":"resultType",
+    "paramDescription": "the type of the results to dump",
+    "paramRequired": true
   }
-]
\ No newline at end of file
+]
+
+
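For context: parameter files like the one above are read by the Spark jobs through the ArgumentApplicationParser of dhp-common, keyed by paramLongName. A minimal sketch of how a job would pick up the new parameters; the resource path is the one introduced by this patch, while the parser API is assumed from dhp-common:

import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class DumpArgsSketch {

    public static void main(String[] args) throws Exception {
        // load the parameter definitions bundled with the jar
        final String jsonConfiguration = IOUtils
            .toString(DumpArgsSketch.class
                .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        // the paramLongName values declared above become the lookup keys
        final String isLookUpUrl = parser.get("isLookUpUrl");
        final String resultTableName = parser.get("resultTableName");
        final String dumpTableName = parser.get("dumpTableName");
    }
}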
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/oozie_app/workflow.xml
index 8566d7667..29641a520 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/oozie_app/workflow.xml
@@ -1,10 +1,18 @@
 <workflow-app name="dump_community_products" xmlns="uri:oozie:workflow:0.5">
     <parameters>
-        <property>
-            <name>inputPath</name>
-            <description>the source path</description>
-        </property>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>isLookUpUrl</name>
+            <description>the isLookup service endpoint</description>
+        </property>
+        <property>
+            <name>outputPath</name>
+            <description>the output path</description>
+        </property>
         <property>
             <name>hiveDbName</name>
             <description>the target hive database name</description>
@@ -72,45 +80,35 @@
 
-    <start to="reset_DB"/>
+    <start to="fork_dump"/>
 
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
 
-    <action name="reset_DB">
-        <hive2 xmlns="uri:oozie:hive2-action:0.1">
-            <configuration>
-                <property>
-                    <name>hive.metastore.uris</name>
-                    <value>${hiveMetastoreUris}</value>
-                </property>
-            </configuration>
-            <jdbc-url>${hiveJdbcUrl}/${hiveDbName}</jdbc-url>
-            <param>hiveDbName=${hiveDbName}</param>
-        </hive2>
-        <ok to="fork_import"/>
-        <error to="Kill"/>
-    </action>
-
-    <fork name="fork_import">
-        <path start="import_publication"/>
-        <path start="import_dataset"/>
-        <path start="import_otherresearchproduct"/>
-        <path start="import_software"/>
-        <path start="import_datasource"/>
-        <path start="import_organization"/>
-        <path start="import_project"/>
-        <path start="import_relation"/>
-    </fork>
+    <fork name="fork_dump">
+        <path start="dump_publication"/>
+        <path start="dump_dataset"/>
+        <path start="dump_otherresearchproduct"/>
+        <path start="dump_software"/>
+    </fork>
 
-    <action name="import_publication">
+    <action name="dump_publication">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table publication</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Dump table publication for community related products</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkDumpCommunityProducts</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -122,21 +120,23 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
+            <arg>--resultType</arg><arg>publication</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--dumpTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Publication</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_dump"/>
         <error to="Kill"/>
     </action>
 
-    <action name="import_dataset">
+    <action name="dump_dataset">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table dataset</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Dump table dataset for community related products</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkDumpCommunityProducts</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -148,21 +148,23 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
+            <arg>--resultType</arg><arg>dataset</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--dumpTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Dataset</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_dump"/>
         <error to="Kill"/>
     </action>
 
-    <action name="import_otherresearchproduct">
+    <action name="dump_otherresearchproduct">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table otherresearchproduct</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Dump table ORP for community related products</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkDumpCommunityProducts</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -174,21 +176,23 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
+            <arg>--resultType</arg><arg>otherresearchproduct</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--dumpTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.OtherResearchProduct</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_dump"/>
         <error to="Kill"/>
     </action>
 
-    <action name="import_software">
+    <action name="dump_software">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table software</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Dump table software for community related products</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkDumpCommunityProducts</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -200,21 +204,25 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
+            <arg>--resultType</arg><arg>software</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--dumpTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Software</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/software</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_dump"/>
         <error to="Kill"/>
     </action>
 
-    <action name="import_datasource">
+    <join name="join_dump" to="prepare_project_results"/>
+
+    <action name="prepare_project_results">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table datasource</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Prepare association result subset of project info</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkPrepareResultProject</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -226,21 +234,26 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="fork_extend"/>
         <error to="Kill"/>
     </action>
 
+    <fork name="fork_extend">
+        <path start="extend_publication"/>
+        <path start="extend_dataset"/>
+        <path start="extend_otherresearchproduct"/>
+        <path start="extend_software"/>
+    </fork>
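The four dump actions above invoke the same SparkDumpCommunityProducts class with different table names. The class itself is not part of this patch; the following is only a rough sketch of the shape such a step takes, where the generic signature and the bean encoders are assumptions, and the community filtering done through the ISLookUp service is omitted:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class DumpStepSketch {

    public static <I, O> void dumpTable(SparkSession spark, String sourcePath, String outputPath,
        Class<I> resultClazz, Class<O> dumpClazz, MapFunction<I, O> toDumpModel) {

        // --resultTableName selects the OAF class used to decode the input records
        Dataset<I> input = spark.read().json(sourcePath).as(Encoders.bean(resultClazz));

        // --dumpTableName selects the dump model written under ${workingDir}
        input
            .map(toDumpModel, Encoders.bean(dumpClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .json(outputPath);
    }
}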
 
-    <action name="import_organization">
+    <action name="extend_publication">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table organization</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Extend dumped publications with information about projects</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkUpdateProjectInfo</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -252,21 +265,21 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/organization</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/publication</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/ext/publication</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Publication</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_extend"/>
         <error to="Kill"/>
     </action>
 
-    <action name="import_project">
+    <action name="extend_dataset">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table project</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Extend dumped dataset with information about projects</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkUpdateProjectInfo</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -278,21 +291,20 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/project</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/dataset</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/ext/dataset</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Dataset</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_extend"/>
         <error to="Kill"/>
     </action>
 
-    <action name="import_relation">
+    <action name="extend_otherresearchproduct">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
-            <name>Import table project</name>
-            <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
+            <name>Extend dumped ORP with information about projects</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkUpdateProjectInfo</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -304,33 +316,182 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
-            <arg>--inputPath</arg><arg>${inputPath}/relation</arg>
-            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
-            <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
-            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/otherresearchproduct</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/ext/otherresearchproduct</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.OtherResearchProduct</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
         </spark>
-        <ok to="join_import"/>
+        <ok to="join_extend"/>
         <error to="Kill"/>
     </action>
 
+    <action name="extend_software">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Extend dumped software with information about projects</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkUpdateProjectInfo</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${workingDir}/software</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/ext/software</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Software</arg>
+            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
+        </spark>
+        <ok to="join_extend"/>
+        <error to="Kill"/>
+    </action>
 
-    <join name="join_import" to="PostProcessing"/>
-
-    <action name="PostProcessing">
-        <hive2 xmlns="uri:oozie:hive2-action:0.1">
-            <configuration>
-                <property>
-                    <name>hive.metastore.uris</name>
-                    <value>${hiveMetastoreUris}</value>
-                </property>
-            </configuration>
-            <jdbc-url>${hiveJdbcUrl}/${hiveDbName}</jdbc-url>
-            <param>hiveDbName=${hiveDbName}</param>
-        </hive2>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
+    <join name="join_extend" to="fork_split"/>
+
+    <fork name="fork_split">
+        <path start="split_publication"/>
+        <path start="split_dataset"/>
+        <path start="split_otherresearchproduct"/>
+        <path start="split_software"/>
+    </fork>
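Each extend action joins one dumped result type with the ${workingDir}/preparedInfo produced by the prepare step. Again a sketch only; the column names ("id", "resultId") are illustrative assumptions, not taken from this patch:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class UpdateProjectInfoSketch {

    public static Dataset<Row> extend(SparkSession spark, String sourcePath, String preparedInfoPath) {
        Dataset<Row> results = spark.read().json(sourcePath);
        Dataset<Row> resultProject = spark.read().json(preparedInfoPath);

        // left join: results without any associated project pass through unchanged
        return results
            .join(resultProject, results.col("id").equalTo(resultProject.col("resultId")), "left")
            .drop(resultProject.col("resultId"));
    }
}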
 
+    <action name="split_publication">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Split dumped result for community</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkSplitForCommunity</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${workingDir}/ext/publication</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Publication</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
+        </spark>
+        <ok to="join_split"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="split_dataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Split dumped result for community</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkSplitForCommunity</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${workingDir}/ext/dataset</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Dataset</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
+        </spark>
+        <ok to="join_split"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="split_otherresearchproduct">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Split dumped result for community</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkSplitForCommunity</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${workingDir}/ext/otherresearchproduct</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.OtherResearchProduct</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
+        </spark>
+        <ok to="join_split"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="split_software">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Split dumped result for community</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.SparkSplitForCommunity</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${workingDir}/ext/software</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.dump.oaf.Software</arg>
+            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
+        </spark>
+        <ok to="join_split"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name="join_split" to="End"/>
 
     <end name="End"/>
 
 </workflow-app>
\ No newline at end of file
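The split step fans the extended results out into one folder per community under ${outputPath}. The community ids are resolved through --isLookUpUrl; in this sketch they are passed as a plain list, and the "context.id" column is an assumption for illustration:

import static org.apache.spark.sql.functions.array_contains;
import static org.apache.spark.sql.functions.col;

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SplitForCommunitySketch {

    public static void split(SparkSession spark, String sourcePath, String outputPath,
        List<String> communities) {

        Dataset<Row> results = spark.read().json(sourcePath);

        // one sub-directory per community id under the output path
        for (String community : communities) {
            results
                .filter(array_contains(col("context.id"), community))
                .write()
                .mode(SaveMode.Append)
                .json(outputPath + "/" + community);
        }
    }
}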
"paramDescription": "the name of the corresondent dump element ", - "paramRequired": true - }, - { - "paramName":"rt", - "paramLongName":"resultType", - "paramDescription": "the name of the corresondent dump element ", + "paramName": "pip", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path of the association result projectlist", "paramRequired": true } ] - diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json index 7d995f39a..82714d973 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json @@ -1,20 +1,20 @@ [ { - "paramName": "issm", + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", + "paramDescription": "true if the spark session is managed, false otherwise", "paramRequired": false - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the source path", - "paramRequired": true - }, - { - "paramName": "g", - "paramLongName": "graphRawPath", - "paramDescription": "the path of the graph Raw in hdfs", - "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json index 04f3c9e1b..dc7d40ba1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json @@ -29,18 +29,6 @@ "paramLongName":"resultTableName", "paramDescription": "the name of the result table we are currently working on", "paramRequired": true - }, - { - "paramName":"dn", - "paramLongName":"dumpTableName", - "paramDescription": "the name of the corresondent dump element ", - "paramRequired": true - }, - { - "paramName":"rt", - "paramLongName":"resultType", - "paramDescription": "the name of the corresondent dump element ", - "paramRequired": true } ]