forked from D-Net/dnet-hadoop
[ImportOC] fix to move original folder instead of extracted ones
This commit is contained in:
parent
599e56dbc6
commit
e430826e00
|
@ -46,6 +46,9 @@ public class GetOpenCitationsRefs implements Serializable {
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath {}", outputPath);
|
log.info("outputPath {}", outputPath);
|
||||||
|
|
||||||
|
final String backupPath = parser.get("backupPath");
|
||||||
|
log.info("backupPath {}", backupPath);
|
||||||
|
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
conf.set("fs.defaultFS", hdfsNameNode);
|
conf.set("fs.defaultFS", hdfsNameNode);
|
||||||
|
|
||||||
|
@ -53,11 +56,11 @@ public class GetOpenCitationsRefs implements Serializable {
|
||||||
|
|
||||||
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
|
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
|
||||||
|
|
||||||
ocr.doExtract(inputPath, outputPath, fileSystem);
|
ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
|
private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||||
|
@ -89,6 +92,7 @@ public class GetOpenCitationsRefs implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,9 +49,6 @@ public class ReadCOCI implements Serializable {
|
||||||
final String workingPath = parser.get("inputPath");
|
final String workingPath = parser.get("inputPath");
|
||||||
log.info("workingPath {}", workingPath);
|
log.info("workingPath {}", workingPath);
|
||||||
|
|
||||||
final String backupPath = parser.get("backupPath");
|
|
||||||
log.info("backupPath {}", backupPath);
|
|
||||||
|
|
||||||
SparkConf sconf = new SparkConf();
|
SparkConf sconf = new SparkConf();
|
||||||
|
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
|
@ -71,14 +68,12 @@ public class ReadCOCI implements Serializable {
|
||||||
workingPath,
|
workingPath,
|
||||||
fileSystem,
|
fileSystem,
|
||||||
outputPath,
|
outputPath,
|
||||||
backupPath,
|
|
||||||
delimiter);
|
delimiter);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
|
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
|
||||||
String outputPath,
|
String outputPath,
|
||||||
String backupPath,
|
|
||||||
String delimiter) throws IOException {
|
String delimiter) throws IOException {
|
||||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||||
.listFiles(
|
.listFiles(
|
||||||
|
@ -113,7 +108,7 @@ public class ReadCOCI implements Serializable {
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
|
|
||||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
fileSystem.delete(fileStatus.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,5 +16,11 @@
|
||||||
"paramLongName": "hdfsNameNode",
|
"paramLongName": "hdfsNameNode",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "bp",
|
||||||
|
"paramLongName": "backupPath",
|
||||||
|
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -30,12 +30,6 @@
|
||||||
"paramLongName": "hdfsNameNode",
|
"paramLongName": "hdfsNameNode",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "bp",
|
|
||||||
"paramLongName": "backupPath",
|
|
||||||
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
|
||||||
"paramRequired": true
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -94,17 +94,7 @@
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
<arg>--inputPath</arg><arg>${inputPath}/Original</arg>
|
<arg>--inputPath</arg><arg>${inputPath}/Original</arg>
|
||||||
<arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
|
<arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||||
</java>
|
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
||||||
<ok to="read"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="extract_correspondence">
|
|
||||||
<java>
|
|
||||||
<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
|
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
|
|
||||||
</java>
|
</java>
|
||||||
<ok to="read"/>
|
<ok to="read"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -129,7 +119,6 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
|
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||||
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
|
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
|
||||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
|
||||||
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
||||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
|
Loading…
Reference in New Issue