From ee1fcb672be74aa2cf95a71f68f16737612ce08e Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 13 Mar 2024 09:46:31 +0100 Subject: [PATCH] code refactor --- .../collection/crossref/Crossref2Oaf.scala | 2 +- .../dhp/collection/mag/MagUtility.scala | 3 ++- .../mag/SparkCreateMagDenormalizedTable.scala | 19 +++++++++++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index 6ad28e8578..420943ca14 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -249,7 +249,7 @@ case object Crossref2Oaf { val subtitles = for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty( title, - ModelConstants.SUBTITLE_QUALIFIER, null) + ModelConstants.SUBTITLE_QUALIFIER, null) result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava) // DESCRIPTION diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala index e65e8e0203..33f19947ac 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala @@ -52,7 +52,8 @@ case class MAGPaper( // List of authors authors: Option[List[MAGAuthor]], // List of Fields of Study - fos: Option[List[MAGFieldOfStudy]] + fos: Option[List[MAGFieldOfStudy]], + urls: Option[List[String]] ) case class MAGAuthor( diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkCreateMagDenormalizedTable.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkCreateMagDenormalizedTable.scala index 73fdbe1f7d..cf6c9967c4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkCreateMagDenormalizedTable.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkCreateMagDenormalizedTable.scala @@ -146,7 +146,7 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String], $"Publisher".as("journalPublisher"), $"Webpage".as("journalWebpage") ) - step3 + val step4 = step3 .join(journals, step3("JournalId") === journals("JournalId"), "left") .select( step3("*"), @@ -155,6 +155,20 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String], journals("journalPublisher"), journals("journalWebpage") ) + .cache + step4.count() + + val paper_urls = MagUtility + .loadMagEntity(spark, "PaperUrls", magBasePath) + .groupBy("PaperId") + .agg(slice(collect_set("SourceUrl"), 1, 6).alias("urls")) + .cache + + paper_urls.count + + step4 + .join(paper_urls, step4("PaperId") === paper_urls("PaperId")) + .select(step4("*"), paper_urls("urls")) .select( $"PaperId".as("paperId"), $"Rank".as("rank"), @@ -192,7 +206,8 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String], $"journalName".as("journalName"), $"journalIssn".as("journalIssn"), $"journalPublisher".as("journalPublisher"), - $"journalWebpage".as("journalWebpage") + $"journalWebpage".as("journalWebpage"), + $"urls" ) .write .mode("OverWrite")