gzipped output

Michele Artini 2020-12-10 11:59:28 +01:00
parent 27e96767e0
commit 94bfed1c84
3 changed files with 15 additions and 18 deletions
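
For context, all three changes apply the same standard Spark DataFrameWriter setting, .option("compression", "gzip"), just before the .json(...) call. A minimal self-contained sketch of the pattern follows; the local session, sample data, and /tmp output path are illustrative assumptions, not taken from the patched classes.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class GzipJsonExample {

	public static void main(final String[] args) {
		// Local session for illustration only; the jobs in this commit obtain theirs elsewhere.
		final SparkSession spark = SparkSession
			.builder()
			.master("local[*]")
			.appName("gzip-json-example")
			.getOrCreate();

		final Dataset<String> ds = spark.createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING());

		ds
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip") // each part file is emitted as part-*.json.gz
			.json("/tmp/gzip-json-example");

		spark.stop();
	}
}

With this option each part file is written as part-*.json.gz; Spark also accepts "bzip2", "deflate", "lz4" and "snappy" as codec names for JSON output.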

@@ -30,9 +30,8 @@ public class CheckDuplictedIdsJob {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
-				.toString(
-					CheckDuplictedIdsJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+				.toString(CheckDuplictedIdsJob.class
+					.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
 		parser.parseArgument(args);
 
 		final SparkConf conf = new SparkConf();
@@ -59,8 +58,8 @@ public class CheckDuplictedIdsJob {
 			.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
 			.write()
 			.mode(SaveMode.Overwrite)
-			.json(countPath);
-		;
+			.option("compression", "gzip")
+			.json(countPath);;
 	}

@@ -42,9 +42,8 @@ public class PartitionEventsByDsIdJob {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
-				.toString(
-					PartitionEventsByDsIdJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
+				.toString(PartitionEventsByDsIdJob.class
+					.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
 		parser.parseArgument(args);
 
 		final Boolean isSparkSessionManaged = Optional
@@ -67,13 +66,12 @@ public class PartitionEventsByDsIdJob {
 		final Set<String> validOpendoarIds = new HashSet<>();
 		if (!opendoarIds.trim().equals("-")) {
 			validOpendoarIds
-				.addAll(
-					Arrays
-						.stream(opendoarIds.split(","))
-						.map(String::trim)
-						.filter(StringUtils::isNotBlank)
-						.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
-						.collect(Collectors.toSet()));
+				.addAll(Arrays
+					.stream(opendoarIds.split(","))
+					.map(String::trim)
+					.filter(StringUtils::isNotBlank)
+					.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
+					.collect(Collectors.toSet()));
 		}
 
 		log.info("validOpendoarIds: {}", validOpendoarIds);
@@ -84,13 +82,12 @@ public class PartitionEventsByDsIdJob {
 				.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
 				.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
 				.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
-				.map(
-					(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
-					Encoders.bean(ShortEventMessageWithGroupId.class))
+				.map((MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
 				.coalesce(1)
 				.write()
 				.partitionBy("group")
 				.mode(SaveMode.Overwrite)
+				.option("compression", "gzip")
 				.json(partitionPath);
 		});

@@ -67,6 +67,7 @@ public class ClusterUtils {
 			.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
 			.json(path);
 	}
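
Read-side note: consumers of these paths need no matching change, because Spark infers the gzip codec from the .gz part-file extension when reading. A quick check, reusing the illustrative output path from the sketch above:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadGzipJsonCheck {

	public static void main(final String[] args) {
		final SparkSession spark = SparkSession
			.builder()
			.master("local[*]")
			.appName("read-gzip-json")
			.getOrCreate();

		// No read option is needed: the codec is detected from the part-*.json.gz extension.
		final Dataset<Row> rows = spark.read().json("/tmp/gzip-json-example");
		rows.show();

		spark.stop();
	}
}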