forked from D-Net/dnet-hadoop
[OpenCitation] change to extract each folder in JSON format just once
parent fbc28ee8c3
commit b071f8e415
@@ -41,18 +41,14 @@ public class ReadCOCI implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 
-		final String hdfsNameNode = parser.get("nameNode");
-		log.info("nameNode: {}", hdfsNameNode);
-
-		final String inputPath = parser.get("sourcePath");
-		log.info("input path : {}", inputPath);
+		final String[] inputFile = parser.get("inputFile").split(";");
+		log.info("inputFile {}", inputFile.toString());
 
 		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 
-		Configuration conf = new Configuration();
-		conf.set("fs.defaultFS", hdfsNameNode);
-
-		FileSystem fileSystem = FileSystem.get(conf);
+		final String workingPath = parser.get("workingPath");
+		log.info("workingPath {}", workingPath);
+
 		SparkConf sconf = new SparkConf();
 
 		final String delimiter = Optional
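Note: with this hunk the job no longer takes a nameNode and a sourcePath to scan; it takes a workingPath plus a single semicolon-separated inputFile value naming the folders to convert. A minimal illustration of how that value becomes the folder list (the value shown here is hypothetical; in the job it arrives as a command-line argument):

	// Hypothetical value; in the job it comes from parser.get("inputFile").
	String rawInputFile = "input1;input2;input3;input4";
	String[] inputFile = rawInputFile.split(";");
	// -> ["input1", "input2", "input3", "input4"], one entry per folder under workingPath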
@@ -65,25 +61,20 @@ public class ReadCOCI implements Serializable {
 			spark -> {
 				doRead(
 					spark,
-					fileSystem,
-					inputPath,
+					workingPath,
+					inputFile,
 					outputPath,
 					delimiter);
 			});
 	}
 
-	public static void doRead(SparkSession spark, FileSystem fileSystem, String inputPath, String outputPath,
+	private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
+		String outputPath,
 		String delimiter) throws IOException {
 
-		RemoteIterator<LocatedFileStatus> iterator = fileSystem
-			.listFiles(
-				new Path(inputPath), true);
+		for(String inputFile : inputFiles){
+			String p_string = workingPath + "/" + inputFile ;
 
-		while (iterator.hasNext()) {
-			LocatedFileStatus fileStatus = iterator.next();
-
-			Path p = fileStatus.getPath();
-			String p_string = p.toString();
 			Dataset<Row> cociData = spark
 				.read()
 				.format("csv")
@@ -91,7 +82,8 @@ public class ReadCOCI implements Serializable {
 				.option("inferSchema", "true")
 				.option("header", "true")
 				.option("quotes", "\"")
-				.load(p_string);
+				.load(p_string)
+				.repartition(100);
 
 			cociData.map((MapFunction<Row, COCI>) row -> {
 				COCI coci = new COCI();
@@ -103,7 +95,7 @@ public class ReadCOCI implements Serializable {
 				.write()
 				.mode(SaveMode.Overwrite)
 				.option("compression", "gzip")
-				.json(outputPath + "/" + p_string.substring(p_string.lastIndexOf("/") + 1));
+				.json(outputPath + inputFile);
 		}
 
 	}
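Taken together, the hunks above replace the recursive HDFS listing with one pass over the named folders: each folder under workingPath is read once as CSV and rewritten as gzip-compressed JSON under outputPath. A standalone sketch of that per-folder loop, assuming a Spark session and eliding the Row-to-COCI mapping (class and method names here are illustrative, not the project's API):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class PerFolderCsvToJsonSketch {

	// Reads each named folder exactly once and writes one JSON folder per input.
	static void extract(SparkSession spark, String workingPath, String[] inputFiles,
		String outputPath, String delimiter) {

		for (String inputFile : inputFiles) {
			Dataset<Row> csv = spark
				.read()
				.format("csv")
				.option("sep", delimiter) // column separator passed via --delimiter
				.option("header", "true")
				.option("inferSchema", "true")
				.load(workingPath + "/" + inputFile)
				.repartition(100);

			csv
				.write()
				.mode(SaveMode.Overwrite)
				.option("compression", "gzip")
				// outputPath is expected to end with "/" (see the test further down),
				// so each input folder gets its own JSON output folder
				.json(outputPath + inputFile);
		}
	}
}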
@@ -1,17 +1,12 @@
 [
   {
-    "paramName": "sp",
-    "paramLongName": "sourcePath",
+    "paramName": "wp",
+    "paramLongName": "workingPath",
     "paramDescription": "the zipped opencitations file",
     "paramRequired": true
   },
-  {
-    "paramName": "nn",
-    "paramLongName": "nameNode",
-    "paramDescription": "the hdfs name node",
-    "paramRequired": true
-  },
   {
     "paramName": "issm",
     "paramLongName": "isSparkSessionManaged",
@@ -28,7 +23,13 @@
     "paramName": "op",
     "paramLongName": "outputPath",
     "paramDescription": "the hdfs name node",
-    "paramRequired": false
+    "paramRequired": true
+  },
+  {
+    "paramName": "if",
+    "paramLongName": "inputFile",
+    "paramDescription": "the hdfs name node",
+    "paramRequired": true
   }
 ]
 
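With the reshaped parameter file, outputPath becomes mandatory and a new inputFile argument carries the folder names. A hedged example of an argument list that would satisfy these definitions (the paths are placeholders; in production the values come from the Oozie workflow below):

	// Placeholder paths, for illustration only.
	String[] args = new String[] {
		"-isSparkSessionManaged", Boolean.FALSE.toString(),
		"-workingPath", "/data/opencitations/COCI",
		"-outputPath", "/data/opencitations/COCI_JSON/",
		"-inputFile", "input1;input2;input3"
	};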
@@ -82,10 +82,10 @@
                     --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                     --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                 </spark-opts>
-                <arg>--sourcePath</arg><arg>${workingPath}/COCI</arg>
-                <arg>--outputPath</arg><arg>${workingDir}/COCI</arg>
-                <arg>--nameNode</arg><arg>${nameNode}</arg>
+                <arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
+                <arg>--outputPath</arg><arg>${workingPath}/COCI_JSON</arg>
                 <arg>--delimiter</arg><arg>${delimiter}</arg>
+                <arg>--inputFile</arg><arg>${inputFileCoci}</arg>
             </spark>
             <ok to="create_actionset"/>
             <error to="Kill"/>
@@ -108,7 +108,7 @@
                     --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                     --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                 </spark-opts>
-                <arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
+                <arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
                 <arg>--outputPath</arg><arg>${outputPath}</arg>
             </spark>
             <ok to="End"/>
@@ -10,6 +10,7 @@ import java.nio.file.Path;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -73,15 +74,53 @@ public class ReadCOCITest {
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();
 
+		LocalFileSystem fs = FileSystem.getLocal(new Configuration());
+		fs
+			.copyFromLocalFile(
+				false, new org.apache.hadoop.fs.Path(getClass()
+					.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1")
+					.getPath()),
+				new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1"));
+
+		fs
+			.copyFromLocalFile(
+				false, new org.apache.hadoop.fs.Path(getClass()
+					.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2")
+					.getPath()),
+				new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2"));
+
+		fs
+			.copyFromLocalFile(
+				false, new org.apache.hadoop.fs.Path(getClass()
+					.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3")
+					.getPath()),
+				new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3"));
+
+		fs
+			.copyFromLocalFile(
+				false, new org.apache.hadoop.fs.Path(getClass()
+					.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4")
+					.getPath()),
+				new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4"));
+
 		ReadCOCI
-			.doRead(
-				spark, FileSystem.getLocal(new Configuration()), inputPath,
-				workingDir.toString() + "/COCI", DEFAULT_DELIMITER);
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-workingPath",
+					workingDir.toString() + "/COCI",
+					"-outputPath",
+					workingDir.toString() + "/COCI_json/",
+					"-inputFile", "input1;input2;input3;input4"
+				});
+
+
 
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
 		JavaRDD<COCI> tmp = sc
-			.textFile(workingDir.toString() + "/COCI/*/")
+			.textFile(workingDir.toString() + "/COCI_json/*/")
 			.map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
 
 		Assertions.assertEquals(23, tmp.count());