forked from D-Net/dnet-hadoop
filter blocks with one record only
This commit is contained in:
parent
7d6e269b40
commit
4e6f46e8fa
|
@ -51,6 +51,7 @@ public class Deduper implements Serializable {
|
|||
.map(it -> Block.from(it, a))
|
||||
.collect(Collectors.toList())
|
||||
.iterator())
|
||||
.filter(b -> b.getDocuments().size() > 1)
|
||||
.mapToPair(block -> new Tuple2<>(block.getKey(), block))
|
||||
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
|
@ -100,16 +101,9 @@ public class SparkBlockStats extends AbstractSparkAction {
|
|||
});
|
||||
|
||||
// create blocks for deduplication
|
||||
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
|
||||
|
||||
JavaRDD<BlockStats> blockStats = blocks
|
||||
JavaRDD<BlockStats> blockStats = Deduper.createSortedBlocks(mapDocuments, dedupConf)
|
||||
.repartition(numPartitions)
|
||||
.map(
|
||||
b -> new BlockStats(
|
||||
b._1(),
|
||||
(long) b._2().getDocuments().size(),
|
||||
computeComparisons(
|
||||
(long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize())));
|
||||
.map(b -> asBlockStats(dedupConf, b));
|
||||
|
||||
// save the blockstats in the workingdir
|
||||
spark
|
||||
|
@ -120,4 +114,12 @@ public class SparkBlockStats extends AbstractSparkAction {
|
|||
}
|
||||
}
|
||||
|
||||
private BlockStats asBlockStats(DedupConfig dedupConf, Tuple2<String, Block> b) {
|
||||
return new BlockStats(
|
||||
b._1(),
|
||||
(long) b._2().getDocuments().size(),
|
||||
computeComparisons(
|
||||
(long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize()));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue