[OpenCitation] change to extract in json format each folder just onece

This commit is contained in:
Miriam Baglioni 2022-02-08 15:37:28 +01:00
parent fbc28ee8c3
commit b071f8e415
4 changed files with 70 additions and 38 deletions

View File

@ -41,18 +41,14 @@ public class ReadCOCI implements Serializable {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode"); final String[] inputFile = parser.get("inputFile").split(";");
log.info("nameNode: {}", hdfsNameNode); log.info("inputFile {}", inputFile.toString());
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser); Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
Configuration conf = new Configuration(); final String workingPath = parser.get("workingPath");
conf.set("fs.defaultFS", hdfsNameNode); log.info("workingPath {}", workingPath);
FileSystem fileSystem = FileSystem.get(conf);
SparkConf sconf = new SparkConf(); SparkConf sconf = new SparkConf();
final String delimiter = Optional final String delimiter = Optional
@ -65,25 +61,20 @@ public class ReadCOCI implements Serializable {
spark -> { spark -> {
doRead( doRead(
spark, spark,
fileSystem, workingPath,
inputPath, inputFile,
outputPath, outputPath,
delimiter); delimiter);
}); });
} }
public static void doRead(SparkSession spark, FileSystem fileSystem, String inputPath, String outputPath, private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
String outputPath,
String delimiter) throws IOException { String delimiter) throws IOException {
RemoteIterator<LocatedFileStatus> iterator = fileSystem for(String inputFile : inputFiles){
.listFiles( String p_string = workingPath + "/" + inputFile ;
new Path(inputPath), true);
while (iterator.hasNext()) {
LocatedFileStatus fileStatus = iterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
Dataset<Row> cociData = spark Dataset<Row> cociData = spark
.read() .read()
.format("csv") .format("csv")
@ -91,7 +82,8 @@ public class ReadCOCI implements Serializable {
.option("inferSchema", "true") .option("inferSchema", "true")
.option("header", "true") .option("header", "true")
.option("quotes", "\"") .option("quotes", "\"")
.load(p_string); .load(p_string)
.repartition(100);
cociData.map((MapFunction<Row, COCI>) row -> { cociData.map((MapFunction<Row, COCI>) row -> {
COCI coci = new COCI(); COCI coci = new COCI();
@ -103,7 +95,7 @@ public class ReadCOCI implements Serializable {
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath + "/" + p_string.substring(p_string.lastIndexOf("/") + 1)); .json(outputPath + inputFile);
} }
} }

View File

@ -1,17 +1,12 @@
[ [
{ {
"paramName": "sp", "paramName": "wp",
"paramLongName": "sourcePath", "paramLongName": "workingPath",
"paramDescription": "the zipped opencitations file", "paramDescription": "the zipped opencitations file",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "nn",
"paramLongName": "nameNode",
"paramDescription": "the hdfs name node",
"paramRequired": true
},
{ {
"paramName": "issm", "paramName": "issm",
"paramLongName": "isSparkSessionManaged", "paramLongName": "isSparkSessionManaged",
@ -28,7 +23,13 @@
"paramName": "op", "paramName": "op",
"paramLongName": "outputPath", "paramLongName": "outputPath",
"paramDescription": "the hdfs name node", "paramDescription": "the hdfs name node",
"paramRequired": false "paramRequired": true
},
{
"paramName": "if",
"paramLongName": "inputFile",
"paramDescription": "the hdfs name node",
"paramRequired": true
} }
] ]

View File

@ -82,10 +82,10 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${workingPath}/COCI</arg> <arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
<arg>--outputPath</arg><arg>${workingDir}/COCI</arg> <arg>--outputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg> <arg>--delimiter</arg><arg>${delimiter}</arg>
<arg>--inputFile</arg><arg>${inputFileCoci}</arg>
</spark> </spark>
<ok to="create_actionset"/> <ok to="create_actionset"/>
<error to="Kill"/> <error to="Kill"/>
@ -108,7 +108,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts> </spark-opts>
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg> <arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg> <arg>--outputPath</arg><arg>${outputPath}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>

View File

@ -10,6 +10,7 @@ import java.nio.file.Path;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@ -73,15 +74,53 @@ public class ReadCOCITest {
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath(); .getPath();
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4"));
ReadCOCI ReadCOCI
.doRead( .main(
spark, FileSystem.getLocal(new Configuration()), inputPath, new String[] {
workingDir.toString() + "/COCI", DEFAULT_DELIMITER); "-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-workingPath",
workingDir.toString() + "/COCI",
"-outputPath",
workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<COCI> tmp = sc JavaRDD<COCI> tmp = sc
.textFile(workingDir.toString() + "/COCI/*/") .textFile(workingDir.toString() + "/COCI_json/*/")
.map(item -> OBJECT_MAPPER.readValue(item, COCI.class)); .map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
Assertions.assertEquals(23, tmp.count()); Assertions.assertEquals(23, tmp.count());