forked from D-Net/dnet-hadoop
fix: filter out the blocks with size = 1 (the size check now runs after reduceByKey instead of before mapToPair)
This commit is contained in:
parent
4b9fb2ffb8
commit
805de4eca1
|
@@ -51,8 +51,8 @@ public class Deduper implements Serializable {
|
||||||
.map(it -> Block.from(it, a))
|
.map(it -> Block.from(it, a))
|
||||||
.collect(Collectors.toList())
|
.collect(Collectors.toList())
|
||||||
.iterator())
|
.iterator())
|
||||||
.filter(b -> b.getDocuments().size() > 1)
|
|
||||||
.mapToPair(block -> new Tuple2<>(block.getKey(), block))
|
.mapToPair(block -> new Tuple2<>(block.getKey(), block))
|
||||||
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
|
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize))
|
||||||
|
.filter(b -> b._2().getDocuments().size() > 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue