package eu.dnetlib.dhp.oa.dedup;

import java.util.Map;
import java.util.stream.Collectors;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

import eu.dnetlib.dhp.oa.dedup.model.Block;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import scala.Serializable;
import scala.Tuple2;

public class Deduper implements Serializable {

	/**
	 * Runs the pairwise comparisons within each sorted block and emits the resulting
	 * similarity relations, deduplicated by the concatenation of their endpoints.
	 */
	public static JavaPairRDD<String, String> computeRelations(
		JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
		Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		return blocks
			.flatMapToPair(
				it -> {
					final SparkReporter reporter = new SparkReporter(accumulators);
					new BlockProcessor(config)
						.processSortedBlock(it._1(), it._2().getDocuments(), reporter);
					return reporter.getRelations().iterator();
				})
			// deduplicate relations: key each one by source + target and keep a single relation per key
			.mapToPair(it -> new Tuple2<>(it._1() + it._2(), it))
			.reduceByKey((a, b) -> a)
			.mapToPair(Tuple2::_2);
	}

	/**
	 * Groups the documents by clustering key into sorted blocks, capped at the configured
	 * queue size; blocks holding a single document are dropped since they yield no comparisons.
	 */
	public static JavaPairRDD<String, Block> createSortedBlocks(
		JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
		final String of = config.getWf().getOrderField();
		final int maxQueueSize = config.getWf().getQueueMaxSize();

		return mapDocs
			// the reduce is just to be sure that we don't have documents with the same id
			.reduceByKey((a, b) -> a)
			.map(Tuple2::_2)

			// Clustering: from <id, doc> to List<groupkey, doc>
			.flatMap(
				a -> DedupUtility
					.getGroupingKeys(config, a)
					.stream()
					.map(it -> Block.from(it, a))
					.collect(Collectors.toList())
					.iterator())
			.mapToPair(block -> new Tuple2<>(block.getKey(), block))
			// merge blocks sharing the same key, keeping at most maxQueueSize documents ordered by the order field
			.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize))
			.filter(b -> b._2().getDocuments().size() > 1);
	}
}
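
// Usage sketch (illustrative only, not part of the original class): the two methods are
// meant to be chained. Documents keyed by id are first grouped into sorted blocks, and
// pairwise comparisons within each block then produce the similarity relations.
// The variable names below (sc, mapDocuments, dedupConf) are hypothetical.
//
//   JavaPairRDD<String, MapDocument> mapDocuments = ...; // <id, parsed document>
//   JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
//   JavaPairRDD<String, String> relations = Deduper.computeRelations(sc, blocks, dedupConf);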