[OpenCitation] change to extract each folder in JSON format just once

2022-02-08 15:37:28 +01:00 · 2022-02-08 15:37:28 +01:00 · b071f8e415
parent fbc28ee8c3
commit b071f8e415
4 changed files with 70 additions and 38 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java
@ -41,18 +41,14 @@ public class ReadCOCI implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);

-		final String hdfsNameNode = parser.get("nameNode");
-		log.info("nameNode: {}", hdfsNameNode);
-
-		final String inputPath = parser.get("sourcePath");
-		log.info("input path : {}", inputPath);
+		final String[] inputFile = parser.get("inputFile").split(";");
+		log.info("inputFile {}", inputFile.toString());
 		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		Configuration conf = new Configuration();
-		conf.set("fs.defaultFS", hdfsNameNode);
+		final String workingPath = parser.get("workingPath");
+		log.info("workingPath {}", workingPath);

-		FileSystem fileSystem = FileSystem.get(conf);
 		SparkConf sconf = new SparkConf();

 		final String delimiter = Optional
@ -65,25 +61,20 @@ public class ReadCOCI implements Serializable {
 			spark -> {
 				doRead(
 					spark,
-					fileSystem,
-					inputPath,
+					workingPath,
+					inputFile,
 					outputPath,
 					delimiter);
 			});
 	}

-	public static void doRead(SparkSession spark, FileSystem fileSystem, String inputPath, String outputPath,
+	private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
+		String outputPath,
 		String delimiter) throws IOException {

-		RemoteIterator<LocatedFileStatus> iterator = fileSystem
-			.listFiles(
-				new Path(inputPath), true);
+		for(String inputFile : inputFiles){
+			String p_string = workingPath + "/" + inputFile ;

-		while (iterator.hasNext()) {
-			LocatedFileStatus fileStatus = iterator.next();
-
-			Path p = fileStatus.getPath();
-			String p_string = p.toString();
 			Dataset<Row> cociData = spark
 				.read()
 				.format("csv")
@ -91,7 +82,8 @@ public class ReadCOCI implements Serializable {
 				.option("inferSchema", "true")
 				.option("header", "true")
 				.option("quotes", "\"")
-				.load(p_string);
+				.load(p_string)
+				.repartition(100);

 			cociData.map((MapFunction<Row, COCI>) row -> {
 				COCI coci = new COCI();
@ -103,7 +95,7 @@ public class ReadCOCI implements Serializable {
 				.write()
 				.mode(SaveMode.Overwrite)
 				.option("compression", "gzip")
-				.json(outputPath + "/" + p_string.substring(p_string.lastIndexOf("/") + 1));
+				.json(outputPath + inputFile);
 		}

 	}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json
@ -1,17 +1,12 @@
 [
  {
-    "paramName": "sp",
-    "paramLongName": "sourcePath",
+    "paramName": "wp",
+    "paramLongName": "workingPath",
    "paramDescription": "the zipped opencitations file",
    "paramRequired": true
  },

-  {
-    "paramName": "nn",
-    "paramLongName": "nameNode",
-    "paramDescription": "the hdfs name node",
-    "paramRequired": true
-  },
+
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
@ -28,7 +23,13 @@
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the hdfs name node",
-    "paramRequired": false
+    "paramRequired": true
+  },
+  {
+    "paramName": "if",
+    "paramLongName": "inputFile",
+    "paramDescription": "the hdfs name node",
+    "paramRequired": true
  }
 ]

--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml
@ -82,10 +82,10 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
-            <arg>--sourcePath</arg><arg>${workingPath}/COCI</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/COCI</arg>
-            <arg>--nameNode</arg><arg>${nameNode}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
+            <arg>--outputPath</arg><arg>${workingPath}/COCI_JSON</arg>
            <arg>--delimiter</arg><arg>${delimiter}</arg>
+            <arg>--inputFile</arg><arg>${inputFileCoci}</arg>
        </spark>
        <ok to="create_actionset"/>
        <error to="Kill"/>
@ -108,7 +108,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
-            <arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
+            <arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
@ -10,6 +10,7 @@ import java.nio.file.Path;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@ -73,15 +74,53 @@ public class ReadCOCITest {
 				"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
 			.getPath();

+		LocalFileSystem fs = FileSystem.getLocal(new Configuration());
+		fs
+				.copyFromLocalFile(
+						false, new org.apache.hadoop.fs.Path(getClass()
+								.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1")
+								.getPath()),
+						new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1"));
+
+		fs
+				.copyFromLocalFile(
+						false, new org.apache.hadoop.fs.Path(getClass()
+								.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2")
+								.getPath()),
+						new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2"));
+
+		fs
+				.copyFromLocalFile(
+						false, new org.apache.hadoop.fs.Path(getClass()
+								.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3")
+								.getPath()),
+						new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3"));
+
+		fs
+				.copyFromLocalFile(
+						false, new org.apache.hadoop.fs.Path(getClass()
+								.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4")
+								.getPath()),
+						new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4"));
+
 		ReadCOCI
-			.doRead(
-				spark, FileSystem.getLocal(new Configuration()), inputPath,
-				workingDir.toString() + "/COCI", DEFAULT_DELIMITER);
+				.main(
+						new String[] {
+								"-isSparkSessionManaged",
+								Boolean.FALSE.toString(),
+								"-workingPath",
+								workingDir.toString() + "/COCI",
+								"-outputPath",
+								workingDir.toString() + "/COCI_json/",
+								"-inputFile", "input1;input2;input3;input4"
+						});
+
+

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

 		JavaRDD<COCI> tmp = sc
-			.textFile(workingDir.toString() + "/COCI/*/")
+			.textFile(workingDir.toString() + "/COCI_json/*/")
 			.map(item -> OBJECT_MAPPER.readValue(item, COCI.class));

 		Assertions.assertEquals(23, tmp.count());