[DOIBoost Mapping] Fix to avoid producing results with a null identifier (probably caused by the filtering function in the factory that creates the id)

2021-11-04 16:16:40 +01:00 · 2021-11-04 16:16:40 +01:00 · de63d29b6f
parent d50057b2d9
commit de63d29b6f
1 changed files with 11 additions and 5 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
@ -164,12 +164,18 @@ object SparkProcessMAG {
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingPath/mag_publication")

-
-    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
-      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
+    spark.read.load(s"$workingPath/mag_publication").as[Publication]
+      .filter(p => p.getId != null) // keep only publications that have an identifier; null-id records are dropped
+      .groupByKey(p => p.getId) // publications sharing an identifier end up in the same group
+      .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
      .map(_._2)
+      .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") // overwrites any previous output at this path

-    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
+//    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
+//      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
+//    .map(_._2)
+//
+//    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")

  }
 }