Hosted By Map - added new aggregator to get just one result per datasource id

This commit is contained in:
Miriam Baglioni 2021-08-04 10:13:30 +02:00
parent 9831725073
commit a7bf314fd2
1 changed files with 36 additions and 0 deletions

View File

@ -103,4 +103,40 @@ object Aggregators {
transformedData
}
def datasourceToSingleIdAggregator: TypedColumn[EntityInfo, EntityInfo] = new Aggregator[EntityInfo, EntityInfo, EntityInfo]{
override def zero: EntityInfo = EntityInfo.newInstance("","","")
override def reduce(b: EntityInfo, a:EntityInfo): EntityInfo = {
return merge(b, a)
}
override def merge(b1: EntityInfo, b2: EntityInfo): EntityInfo = {
if (b1 == null){
return b2
}
if(b2 == null){
return b1
}
if(!b1.getHb_id.equals("")){
return b1
}
b2
}
override def finish(reduction: EntityInfo): EntityInfo = reduction
override def bufferEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
override def outputEncoder: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
}.toColumn
def datasourceToSingleId(df:Dataset[EntityInfo]): Dataset[EntityInfo] = {
val transformedData : Dataset[EntityInfo] = df
.groupByKey(_.getHb_id)(Encoders.STRING)
.agg(Aggregators.datasourceToSingleIdAggregator)
.map{
case (id:String , res: EntityInfo) => res
}(Encoders.bean(classOf[EntityInfo]))
transformedData
}
}