[DOIBoost Mapping] Fix to avoid producing results with a null identifier (probably caused by the filtering function in the factory that creates the id)

2021-11-04 16:16:40 +01:00 · 2021-11-04 16:16:40 +01:00 · de63d29b6f
parent d50057b2d9
commit de63d29b6f
1 changed files with 11 additions and 5 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
@ -164,12 +164,18 @@ object SparkProcessMAG {
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingPath/mag_publication")
-
+    spark.read.load(s"$workingPath/mag_publication").as[Publication]
-    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
+      .filter(p => p.getId == null)
-      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
+      .groupByKey(p => p.getId)
      .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
      .map(_._2)
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
-    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
+//    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
 //      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
 //    .map(_._2)
 //
 //    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
  }
 }