forked from D-Net/dnet-hadoop
fix: filter out the blocks with size = 1
This commit is contained in:
parent
4b9fb2ffb8
commit
805de4eca1
|
@ -51,8 +51,8 @@ public class Deduper implements Serializable {
|
|||
.map(it -> Block.from(it, a))
|
||||
.collect(Collectors.toList())
|
||||
.iterator())
|
||||
.filter(b -> b.getDocuments().size() > 1)
|
||||
.mapToPair(block -> new Tuple2<>(block.getKey(), block))
|
||||
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
|
||||
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize))
|
||||
.filter(b -> b._2().getDocuments().size() > 1);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue