/*
 * BlockUtils - static helpers that operate on a Spark JavaRDD of Block objects.
 * The Block accessors used here are comparisons(), elements(), getKey() and
 * getElements(); their exact semantics are defined outside this file.
 *
 * NOTE(review): this copy of the file looks mangled and probably does not compile as is:
 *   - the whole class sits on three physical lines, so every line comment swallows
 *     the code that follows it on the same line;
 *   - generic type arguments appear to be stripped (for example: List>> collect,
 *     ArrayList> statistics, Tuple2 st, JavaRDD blocks);
 *   - the text between [for (int i=1; i] and [blocks){] is missing, which removes the
 *     rest of getOptimalComparisonNumber and the signature of the method after it.
 *   Restore the original formatting from version control before changing any logic.
 *
 * What the visible code does:
 *
 * getOptimalComparisonNumber(blocks) -> double
 *   Maps every block to (comparisons(), elements()), then for each distinct
 *   comparisons() value sums both the comparisons and the elements, collects the
 *   result to the driver and sorts it ascending by the comparisons() key.
 *   Index 0 of blockAssignments / comparisonsLevel / totalComparisonsPerLevel is
 *   seeded from the first entry. The accumulation loop, the use of
 *   SMOOTHING_FACTOR (1.05) and the return statement are not visible here.
 *   NOTE(review): collect.get(0) throws IndexOutOfBoundsException when the RDD is
 *   empty - presumably callers never pass an empty RDD; confirm.
 *
 * block-size method (signature not visible; blockPurging calls it as
 * getOptimalBlockSize(blocks) and stores the result in an int)
 *   Counts how many blocks share each elements() value and sorts by that value.
 *   Walking the sizes in ascending order it accumulates
 *     totalSizeOfBlocks   += freq * blockSize
 *     numberOfComparisons += freq * ((blockSize * (blockSize - 1)) shiftLeft 1)
 *   and records (blockSize, CC) with CC = totalSizeOfBlocks / numberOfComparisons.
 *   It then scans adjacent pairs from the largest size down to index 1 and returns
 *   the blockSize at the position whose absolute CC difference to its predecessor
 *   is the smallest one found below the initial eps of 1.0; when no difference is
 *   below 1.0 the largest block size is returned.
 *   NOTE(review): shiftLeft(1) doubles blockSize * (blockSize - 1); a pair count
 *   would normally halve it (shiftRight) - confirm which one is intended.
 *   NOTE(review): statistics.get(statistics.size() - 1) throws
 *   IndexOutOfBoundsException when there are no blocks - confirm callers guard this.
 *
 * blockPurging(blocks)
 *   Prints the optimal block size to standard output and keeps only the blocks
 *   whose getElements().size() is strictly smaller than it.
 *
 * blockPurging2(blocks)
 *   Prints the optimal comparison number to standard output and keeps only the
 *   blocks whose comparisons() is strictly smaller than it.
 *
 * blockFiltering(blocks)
 *   Emits (element, (blockKey, comparisons)) for every element of every block and
 *   groups by element. For each element the block list is sorted ascending by
 *   comparisons and only the first Math.round(size * 0.85) entries are kept
 *   (RATIO = 0.85). The surviving (blockKey, element) pairs are grouped by
 *   blockKey and each group is turned into a new Block(key, elements).
 */
package eu.dnetlib.pace.utils; import com.google.common.collect.Lists; import eu.dnetlib.support.Block; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import scala.Tuple2; import java.io.Serializable; import java.math.BigInteger; import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; public class BlockUtils implements Serializable { public static double getOptimalComparisonNumber(JavaRDD blocks) { double SMOOTHING_FACTOR = 1.05; //pairRDD: cardinality, #elements List>> collect = blocks.mapToPair(b -> new Tuple2<>(b.comparisons(), b.elements())) .mapToPair(bs -> new Tuple2<>(bs._1(), new Tuple2<>(bs._1(), bs._2()))) .reduceByKey((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2())).collect(); collect = new ArrayList<>(collect); collect.sort(Comparator.comparing(Tuple2::_1)); double[] blockAssignments = new double[collect.size()]; double[] comparisonsLevel = new double[collect.size()]; double[] totalComparisonsPerLevel = new double[collect.size()]; Integer totalComparisons = collect.get(0)._2()._1(); Integer totalBlockSize = collect.get(0)._2()._2(); blockAssignments[0] = totalBlockSize; comparisonsLevel[0] = collect.get(0)._1(); totalComparisonsPerLevel[0] = totalComparisons; for (int i=1; i blocks){ BigInteger numberOfComparisons = BigInteger.ZERO; BigInteger totalSizeOfBlocks = BigInteger.ZERO; BigInteger blockSize; //block_size, frequency JavaPairRDD blocksFreq = blocks.mapToPair(b -> new Tuple2<>(b.getKey(), b.elements())) .mapToPair(bs -> new Tuple2<>(bs._2(),1)) .reduceByKey((a,b) -> a+b).sortByKey(); ArrayList> blockSizesAndFreq = new ArrayList<>(blocksFreq.collect()); double CC = 0d; int freq; /* * statistics: array of pairs (blockSize, CC) for every blockSize */ ArrayList> statistics = new ArrayList<>(); for (int i = 0; i < blockSizesAndFreq.size(); i++) { blockSize = new BigInteger(blockSizesAndFreq.get(i)._1.toString()); freq = 
blockSizesAndFreq.get(i)._2; totalSizeOfBlocks = totalSizeOfBlocks.add(BigInteger.valueOf(freq).multiply(blockSize)); //accumulated number of comparisons numberOfComparisons = numberOfComparisons.add(BigInteger.valueOf(freq) .multiply(blockSize.multiply(blockSize.subtract(BigInteger.ONE)).shiftLeft(1))); CC = totalSizeOfBlocks.doubleValue() / numberOfComparisons.doubleValue(); Tuple2 st = new Tuple2<>(blockSize.intValue(), CC); statistics.add(st); } int optimalBlockSize = statistics.get(statistics.size() - 1)._1;// lastBlockSize; double eps = 1d; //smoothing factor /* * find minimum difference for every adjacent pair i,i-1 the minimum difference * represents the optimal blockSize */ for (int i = statistics.size() - 1; i >= 1; i--) { if (Math.abs(statistics.get(i)._2 - statistics.get(i - 1)._2) < eps) { eps = Math.abs(statistics.get(i)._2 - statistics.get(i - 1)._2); optimalBlockSize = statistics.get(i)._1; } } return optimalBlockSize; } //cut blocks basing on number of elements public static JavaRDD blockPurging(JavaRDD blocks) { int optimalBlockSize = getOptimalBlockSize(blocks); System.out.println("optimalBlockSize = " + optimalBlockSize); return blocks.filter(b -> b.getElements().size() < optimalBlockSize); } //cut blocks basing on number of comparisons public static JavaRDD blockPurging2(JavaRDD blocks) { double optimalComparisonNumber = getOptimalComparisonNumber(blocks); System.out.println("optimalComparisonNumber = " + optimalComparisonNumber); return blocks.filter(b -> b.comparisons() < optimalComparisonNumber); } public static JavaRDD blockFiltering(JavaRDD blocks) { double RATIO = 0.85; return blocks .flatMapToPair(b -> b.getElements().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator()) .groupByKey() .mapToPair(es -> { List> b = Lists.newArrayList(es._2()); b.sort(Comparator.comparing(Tuple2::_2)); int size = b.size(); long limit = Math.round(size*RATIO); return new Tuple2<>(es._1(),b.subList(0,(int)limit)); }) 
.flatMapToPair(es -> es._2().stream().map(it -> new Tuple2<>(it._1(), es._1())).collect(Collectors.toList()).iterator()) .groupByKey().map(b -> new Block(b._1(), b._2())); } }