forked from D-Net/dnet-hadoop
Generate "merged" dedup id relations also for records that are filtered out by the cut parameters
This commit is contained in:
parent
1287315ffb
commit
831cc1fdde
|
@ -213,7 +213,6 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
|||
.join(pivotingData, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left")
|
||||
.withColumn("pivot", functions.first("id").over(w))
|
||||
.withColumn("position", functions.row_number().over(w))
|
||||
.filter(cut > 0 ? col("position").lt(lit(cut)) : lit(true)) // apply cut after choosing pivot
|
||||
.flatMap(
|
||||
(FlatMapFunction<Row, Tuple3<String, String, String>>) (Row r) -> {
|
||||
String id = r.getAs("id");
|
||||
|
@ -249,7 +248,9 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
|||
}
|
||||
|
||||
// add merge relations
|
||||
if (cut <=0 || r.<Integer>getAs("position") <= cut) {
|
||||
res.add(new Tuple3<>(id, pivotDedupId, pivot));
|
||||
}
|
||||
|
||||
return res.iterator();
|
||||
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING()))
|
||||
|
|
Loading…
Reference in New Issue