From bc12e9819e9bffcbc2e09c5bea6df82cc2807c83 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 7 Jun 2021 16:37:01 +0200 Subject: [PATCH 1/4] Aggiornare 'dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala' The change is to fix the issue that arises when the same work appears more than once on the same ORCID profile. The change avoids replicating the association doi -> author when the orcid id is already associated to the doi. --- .../dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index 64be5e79a..af1d4ac37 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -15,6 +15,12 @@ import org.slf4j.{Logger, LoggerFactory} object SparkConvertORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) + def fixORCIDItem(item :ORCIDItem):ORCIDItem = { + item.authors = item.authors.groupBy(_.oid).map(_._2.head) + item + } + + def run(spark:SparkSession,sourcePath:String,workingPath:String, targetPath:String):Unit = { import spark.implicits._ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] @@ -37,7 +43,8 @@ object SparkConvertORCIDToOAF { val author = i._2 (doi, author) }).groupBy(col("_1").alias("doi")) - .agg(collect_list(col("_2")).alias("authors")) + .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] + .map(s => fixORCIDItem(s)) .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] From 
f33521d3386a1bfbca8874772358aead03e2915b Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 7 Jun 2021 17:27:07 +0200 Subject: [PATCH 2/4] Aggiornare 'dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala' to be able to replace the object assigned to author, val has been replaced by var --- .../java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index af1d4ac37..dbc2b0025 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -40,7 +40,7 @@ object SparkConvertORCIDToOAF { works.joinWith(authors, authors("oid").equalTo(works("oid"))) .map(i =>{ val doi = i._1.doi - val author = i._2 + var author = i._2 (doi, author) }).groupBy(col("_1").alias("doi")) .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] From 8d2e086e489ab9d31d32ecb3dbb27ec3fd83eb41 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 7 Jun 2021 17:50:37 +0200 Subject: [PATCH 3/4] changes to avoid reassignment to val --- .../eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index dbc2b0025..a359eb3c6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -16,8 +16,8 @@ object SparkConvertORCIDToOAF { 
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) def fixORCIDItem(item :ORCIDItem):ORCIDItem = { - item.authors = item.authors.groupBy(_.oid).map(_._2.head) - item + new ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) + } From dc07f1079b1b09aecb25a218a69cb594cb0462a6 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 8 Jun 2021 12:06:10 +0200 Subject: [PATCH 4/4] added check in case the author set to be enriched is null --- .../SparkOrcidToResultFromSemRelJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index e90e43a20..436a53cbe 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -108,7 +108,7 @@ public class SparkOrcidToResultFromSemRelJob { return value -> { R ret = value._1(); Optional rol = Optional.ofNullable(value._2()); - if (rol.isPresent()) { + if (rol.isPresent() && Optional.ofNullable(ret.getAuthor()).isPresent()) { List toenrich_author = ret.getAuthor(); List autoritativeAuthors = rol.get().getAuthorList(); for (Author author : toenrich_author) {