gzipped output

This commit is contained in:
Michele Artini 2020-12-10 11:59:28 +01:00
parent b6f08ce226
commit ff41a7b3a4
3 changed files with 15 additions and 18 deletions

View File

@ -30,8 +30,7 @@ public class CheckDuplictedIdsJob {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(CheckDuplictedIdsJob.class
CheckDuplictedIdsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args); parser.parseArgument(args);
@ -59,8 +58,8 @@ public class CheckDuplictedIdsJob {
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG())) .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.json(countPath); .option("compression", "gzip")
; .json(countPath);;
} }

View File

@ -42,8 +42,7 @@ public class PartitionEventsByDsIdJob {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(PartitionEventsByDsIdJob.class
PartitionEventsByDsIdJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json"))); .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
parser.parseArgument(args); parser.parseArgument(args);
@ -67,8 +66,7 @@ public class PartitionEventsByDsIdJob {
final Set<String> validOpendoarIds = new HashSet<>(); final Set<String> validOpendoarIds = new HashSet<>();
if (!opendoarIds.trim().equals("-")) { if (!opendoarIds.trim().equals("-")) {
validOpendoarIds validOpendoarIds
.addAll( .addAll(Arrays
Arrays
.stream(opendoarIds.split(",")) .stream(opendoarIds.split(","))
.map(String::trim) .map(String::trim)
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)
@ -84,13 +82,12 @@ public class PartitionEventsByDsIdJob {
.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId())) .filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX)) .filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId())) .filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
.map( .map((MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
Encoders.bean(ShortEventMessageWithGroupId.class))
.coalesce(1) .coalesce(1)
.write() .write()
.partitionBy("group") .partitionBy("group")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(partitionPath); .json(partitionPath);
}); });

View File

@ -67,6 +67,7 @@ public class ClusterUtils {
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz)) .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(path); .json(path);
} }