[OpenCitation] add compression option when writing the sequence file

This commit is contained in:
Miriam Baglioni 2024-04-03 09:25:00 +02:00
parent 4f0a044245
commit 42846d3b91
1 changed file with 1 addition and 1 deletion

View File

@@ -88,7 +88,7 @@ public class CreateActionSetSparkJob implements Serializable {
 private static void extractContent(SparkSession spark, String inputPath, String outputPath) {
 getTextTextJavaPairRDD(spark, inputPath)
-.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);// , GzipCodec.class);
+.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
 }
 private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(SparkSession spark, String inputPath) {