forked from D-Net/dnet-hadoop
[DOIBoost Mapping] Fix to avoid to produce results with null as identifier (probably due to the filtering function in the factory for the creation of the id)
This commit is contained in:
parent
d50057b2d9
commit
de63d29b6f
|
@ -164,12 +164,18 @@ object SparkProcessMAG {
|
|||
.write.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/mag_publication")
|
||||
|
||||
|
||||
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||
spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||
.filter(p => p.getId == null)
|
||||
.groupByKey(p => p.getId)
|
||||
.reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||
.map(_._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||
|
||||
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||
// val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||
// .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||
// .map(_._2)
|
||||
//
|
||||
// spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue