From de63d29b6f5b6cdc0a45df94f5ade5f9c4c28711 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 4 Nov 2021 16:16:40 +0100 Subject: [PATCH] [DOIBoost Mapping] Fix to avoid to produce results with null as identifier (probably due to the filtering function in the factory for the creation of the id) --- .../dnetlib/doiboost/mag/SparkProcessMAG.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index c011cbd20..fa3be973d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -164,12 +164,18 @@ object SparkProcessMAG { .write.mode(SaveMode.Overwrite) .save(s"$workingPath/mag_publication") + spark.read.load(s"$workingPath/mag_publication").as[Publication] + .filter(p => p.getId == null) + .groupByKey(p => p.getId) + .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) + .map(_._2) + .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") - val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] - .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) - .map(_._2) - - spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") +// val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] +// .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) +// .map(_._2) +// +// spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") } }