From 2ba67f08d37152babb09f8d44adb4fcd95f8b0a1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 25 Sep 2024 15:27:02 +0200 Subject: [PATCH] [OpenCitations] move the extracted contents under a backup path to avoid needing to re-download it in case of errors --- .../actionmanager/opencitations/ReadCOCI.java | 8 +++++++- .../input_readcoci_parameters.json | 19 +++++++++++++------ .../opencitations/oozie_app/workflow.xml | 1 + 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java index 479aea458..4b0bbf145 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java @@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable { final String workingPath = parser.get("inputPath"); log.info("workingPath {}", workingPath); + final String backupPath = parser.get("backupPath"); + log.info("backupPath {}", backupPath); + SparkConf sconf = new SparkConf(); Configuration conf = new Configuration(); @@ -68,12 +71,14 @@ public class ReadCOCI implements Serializable { workingPath, fileSystem, outputPath, + backupPath, delimiter); }); } private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem, String outputPath, + String backupPath, String delimiter) throws IOException { RemoteIterator fileStatusListIterator = fileSystem .listFiles( @@ -107,7 +112,8 @@ public class ReadCOCI implements Serializable { .mode(SaveMode.Append) .option("compression", "gzip") .json(outputPath); - fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE")); + + fileSystem.rename(fileStatus.getPath(), new Path(backupPath)); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json index a74ceb983..d1f495d67 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json @@ -24,12 +24,19 @@ "paramLongName": "outputPath", "paramDescription": "the hdfs name node", "paramRequired": true - }, { - "paramName": "nn", - "paramLongName": "hdfsNameNode", - "paramDescription": "the hdfs name node", - "paramRequired": true -} + }, + { + "paramName": "nn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the hdfs name node", + "paramRequired": true + }, + { + "paramName": "bp", + "paramLongName": "backupPath", + "paramDescription": "the hdfs path to move the OC data after the extraction", + "paramRequired": true + } ] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml index 566cf7d02..f170af96f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml @@ -129,6 +129,7 @@ --inputPath${inputPath}/Extracted --outputPath${inputPath}/JSON + --backupPath${inputPath}/backup --delimiter${delimiter} --hdfsNameNode${nameNode}