forked from D-Net/dnet-hadoop
[DOIBoost Mapping] Fix to avoid producing results with a null identifier (probably caused by the filtering function in the factory that creates the id)
This commit is contained in:
parent
d50057b2d9
commit
de63d29b6f
|
@ -164,12 +164,18 @@ object SparkProcessMAG {
|
||||||
.write.mode(SaveMode.Overwrite)
|
.write.mode(SaveMode.Overwrite)
|
||||||
.save(s"$workingPath/mag_publication")
|
.save(s"$workingPath/mag_publication")
|
||||||
|
|
||||||
|
spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||||
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
// Keep only publications that have an identifier: records with a null id must be dropped before grouping by id.
.filter(p => p.getId != null)
|
||||||
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
.groupByKey(p => p.getId)
|
||||||
|
.reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||||
.map(_._2)
|
.map(_._2)
|
||||||
|
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||||
|
|
||||||
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
// val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||||
|
// .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||||
|
// .map(_._2)
|
||||||
|
//
|
||||||
|
// spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue