[OpenCitations] move the extracted contents to a backup path so that they do not need to be re-downloaded in case of errors
This commit is contained in:
parent
535a7b99f1
commit
e354f9853a
|
@ -49,6 +49,9 @@ public class ReadCOCI implements Serializable {
|
|||
final String workingPath = parser.get("inputPath");
|
||||
log.info("workingPath {}", workingPath);
|
||||
|
||||
final String backupPath = parser.get("backupPath");
|
||||
log.info("backupPath {}", backupPath);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
|
@ -68,12 +71,14 @@ public class ReadCOCI implements Serializable {
|
|||
workingPath,
|
||||
fileSystem,
|
||||
outputPath,
|
||||
backupPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
|
||||
String outputPath,
|
||||
String backupPath,
|
||||
String delimiter) throws IOException {
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
|
@ -107,7 +112,8 @@ public class ReadCOCI implements Serializable {
|
|||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
|
||||
|
||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,11 +24,18 @@
|
|||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the hdfs path where the JSON output is written",
|
||||
"paramRequired": true
|
||||
}, {
|
||||
},
|
||||
{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bp",
|
||||
"paramLongName": "backupPath",
|
||||
"paramDescription": "the hdfs path where the OC data is moved after the extraction",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
||||
|
|
|
@ -129,6 +129,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
|
||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
||||
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
</spark>
|
||||
|
|
Loading…
Reference in New Issue