[DOIBoost Mapping] Fix to avoid to produce results with null as identifier (probably due to the filtering function in the factory for the creation of the id)

This commit is contained in:
Miriam Baglioni 2021-11-04 16:16:40 +01:00
parent d50057b2d9
commit de63d29b6f
1 changed files with 11 additions and 5 deletions

View File

@ -164,12 +164,18 @@ object SparkProcessMAG {
.write.mode(SaveMode.Overwrite) .write.mode(SaveMode.Overwrite)
.save(s"$workingPath/mag_publication") .save(s"$workingPath/mag_publication")
spark.read.load(s"$workingPath/mag_publication").as[Publication]
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] .filter(p => p.getId == null)
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) .groupByKey(p => p.getId)
.reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.map(_._2) .map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") // val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
// .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
// .map(_._2)
//
// spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
} }
} }