forked from D-Net/dnet-hadoop
Compress (gzip) the text output produced by migration steps 1 and 2
This commit is contained in:
parent
2f11e37602
commit
c7e0730720
|
@ -15,6 +15,7 @@ import org.apache.commons.logging.LogFactory;
|
|||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
@ -87,7 +88,7 @@ public class GenerateEntitiesApplication {
|
|||
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
|
||||
}
|
||||
|
||||
inputRdd.saveAsTextFile(targetPath);
|
||||
inputRdd.saveAsTextFile(targetPath, GzipCodec.class);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
|
@ -60,7 +61,7 @@ public class DispatchEntitiesApplication {
|
|||
sc.textFile(sourcePath)
|
||||
.filter(l -> isEntityType(l, type))
|
||||
.map(l -> StringUtils.substringAfter(l, "|"))
|
||||
.saveAsTextFile(targetPath + "/" + type); // use repartition(XXX) ???
|
||||
.saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
|
||||
}
|
||||
|
||||
private static boolean isEntityType(final String line, final String type) {
|
||||
|
|
Loading…
Reference in New Issue