From 83c262a483a4ecc4d7a12d6ddab2086c2f447387 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 15 May 2020 18:18:31 +0200 Subject: [PATCH] workflow to download the files --- .../h2020programme/oozie_app/workflow.xml | 145 +++++++----------- 1 file changed, 57 insertions(+), 88 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/workflow.xml index 3e7f68401..9b200c2a9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/workflow.xml @@ -1,112 +1,81 @@ - + - sequenceFilePath - the path to store the sequence file of the native metadata collected + projectFileURL + the url where to get the projects file - mdStorePath - the path of the native mdstore + programmeFileURL + the url where to get the programme file - apiDescription - A json encoding of the API Description class - - - - dataSourceInfo - A json encoding of the Datasource Info - - - identifierPath - An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier - - - - metadataEncoding - The type of the metadata XML/JSON - - - - timestamp - The timestamp of the collection date - - - - workflowId - The identifier of the workflow + outputPath + path where to store the action set - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + - - - - + + - + - - - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.collection.worker.DnetCollectorWorker - -p${sequenceFilePath} - -a${apiDescription} - -n${nameNode} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -usandro.labruzzo - -w${workflowId} - - - - - - - ${jobTracker} - ${nameNode} - yarn - cluster - GenerateNativeStoreSparkJob - eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --encoding ${metadataEncoding} - --dateOfCollection ${timestamp} - --provenance ${dataSourceInfo} - --xpath${identifierPath} - --input${sequenceFilePath} - --output${mdStorePath} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -w${workflowId} - - - - - - - - - + + + eu.dnetlib.dhp.actionset.h2020programme.GetFile + --hdfsNameNode${nameNode} + --fileUrl${projectFileURL} + --hdfsPath${workingDir}/projects.csv + + + + + eu.dnetlib.dhp.actionset.h2020programme.GetFile + --hdfsNameNode${nameNode} + --fileUrl${programmeFileURL} + --hdfsPath${workingDir}/programme.csv + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file