From 524e5f30921731421b4da85d4043034c5cdcb2ac Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 19 Apr 2021 15:17:25 +0200 Subject: [PATCH] Improved parallelization on transformation wf on hadoop --- .../dhp/transformation/TransformSparkJobNode.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 8eafaadca..c7ba700a0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -149,4 +149,14 @@ public class TransformSparkJobNode { } } + /** + * Calculates the number of partitions allocating at most @rpt records for a single transformation task. + * @param totalInput + * @param rpt + * @return + */ + private static int getRepartitionNumber(long totalInput, Integer rpt) { + return (int) (totalInput / rpt); + } + }