forked from antonis.lempesis/dnet-hadoop

Commit ff41a7b3a4 (parent b6f08ce226): gzipped output
@@ -30,9 +30,8 @@ public class CheckDuplictedIdsJob {
 
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(
             IOUtils
-                .toString(
-                    CheckDuplictedIdsJob.class
-                        .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+                .toString(CheckDuplictedIdsJob.class
+                    .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
         parser.parseArgument(args);
 
         final SparkConf conf = new SparkConf();
@@ -59,8 +58,8 @@ public class CheckDuplictedIdsJob {
             .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
             .write()
             .mode(SaveMode.Overwrite)
-            .json(countPath);
-        ;
+            .option("compression", "gzip")
+            .json(countPath);;
 
     }
 
@@ -42,9 +42,8 @@ public class PartitionEventsByDsIdJob {
 
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(
             IOUtils
-                .toString(
-                    PartitionEventsByDsIdJob.class
-                        .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
+                .toString(PartitionEventsByDsIdJob.class
+                    .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
         parser.parseArgument(args);
 
         final Boolean isSparkSessionManaged = Optional
@@ -67,13 +66,12 @@ public class PartitionEventsByDsIdJob {
         final Set<String> validOpendoarIds = new HashSet<>();
         if (!opendoarIds.trim().equals("-")) {
             validOpendoarIds
-                .addAll(
-                    Arrays
-                        .stream(opendoarIds.split(","))
-                        .map(String::trim)
-                        .filter(StringUtils::isNotBlank)
-                        .map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
-                        .collect(Collectors.toSet()));
+                .addAll(Arrays
+                    .stream(opendoarIds.split(","))
+                    .map(String::trim)
+                    .filter(StringUtils::isNotBlank)
+                    .map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
+                    .collect(Collectors.toSet()));
         }
         log.info("validOpendoarIds: {}", validOpendoarIds);
 
@@ -84,13 +82,12 @@ public class PartitionEventsByDsIdJob {
             .filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
             .filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
             .filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
-            .map(
-                (MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
-                Encoders.bean(ShortEventMessageWithGroupId.class))
+            .map((MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
             .coalesce(1)
             .write()
             .partitionBy("group")
             .mode(SaveMode.Overwrite)
+            .option("compression", "gzip")
             .json(partitionPath);
 
         });
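For context, the stream refactored in the hunk above builds the set of hashed datasource ids that the filters below compare against. A minimal, self-contained sketch of that mapping follows; the class name OpendoarIdSketch, the sample input, and the prefix value are illustrative assumptions, since the real OPENDOAR_NSPREFIX constant is defined in PartitionEventsByDsIdJob and its value is not shown in this diff.

import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;

public class OpendoarIdSketch {

    // assumed placeholder value; the real constant lives in PartitionEventsByDsIdJob
    private static final String OPENDOAR_NSPREFIX = "opendoar____::";

    public static void main(final String[] args) {
        // comma-separated OpenDOAR repository ids, possibly with stray spaces/blanks
        final String opendoarIds = "1234, 5678 ,";

        final Set<String> validOpendoarIds = Arrays
            .stream(opendoarIds.split(","))
            .map(String::trim)
            .filter(StringUtils::isNotBlank)
            .map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
            .collect(Collectors.toSet());

        // each entry is the namespace prefix followed by the MD5 hex of the plain id,
        // matching the targetDatasourceId values the job filters on
        validOpendoarIds.forEach(System.out::println);
    }
}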
@@ -67,6 +67,7 @@ public class ClusterUtils {
             .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
             .write()
             .mode(SaveMode.Overwrite)
+            .option("compression", "gzip")
             .json(path);
     }
 
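All three files converge on the same change: setting the "compression" option on the Spark DataFrameWriter before the JSON write, which makes Spark emit gzip-compressed part files (part-*.json.gz) instead of plain part-*.json, while reading them back needs no extra options. Below is a minimal sketch of that pattern, not taken from this repository; the class name GzipJsonWriteSketch, the local master, the /tmp output path, and the sample data are assumptions for illustration only.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class GzipJsonWriteSketch {

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession
            .builder()
            .appName("gzip-json-sketch")
            .master("local[*]")
            .getOrCreate();

        final Dataset<Row> ds = spark
            .createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING())
            .toDF("value");

        ds
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip") // same option added by this commit
            .json("/tmp/gzip-json-sketch"); // produces part-*.json.gz files

        // reading the compressed output back requires no additional option
        spark.read().json("/tmp/gzip-json-sketch").show();

        spark.stop();
    }
}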