From bc12e9819e9bffcbc2e09c5bea6df82cc2807c83 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 7 Jun 2021 16:37:01 +0200 Subject: [PATCH] Aggiornare 'dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala' The change fixes the issue that arises when the same work appears more than once on the same ORCID profile. It avoids replicating the doi -> author association when the ORCID id is already associated with the doi. --- .../dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index 64be5e79a..af1d4ac37 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -15,6 +15,12 @@ import org.slf4j.{Logger, LoggerFactory} object SparkConvertORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) + def fixORCIDItem(item :ORCIDItem):ORCIDItem = { + item.authors = item.authors.groupBy(_.oid).map(_._2.head) + item + } + + def run(spark:SparkSession,sourcePath:String,workingPath:String, targetPath:String):Unit = { import spark.implicits._ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] @@ -37,7 +43,8 @@ object SparkConvertORCIDToOAF { val author = i._2 (doi, author) }).groupBy(col("_1").alias("doi")) - .agg(collect_list(col("_2")).alias("authors")) + .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] + .map(s => fixORCIDItem(s)) .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]