1
0
Fork 0

code refactor: add per-paper source URLs (`urls`) to the MAG denormalized table

This commit is contained in:
Sandro La Bruzzo 2024-03-13 09:46:31 +01:00
parent c532831718
commit ee1fcb672b
3 changed files with 20 additions and 4 deletions

View File

@ -249,7 +249,7 @@ case object Crossref2Oaf {
val subtitles = val subtitles =
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty( for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty(
title, title,
ModelConstants.SUBTITLE_QUALIFIER, null) ModelConstants.SUBTITLE_QUALIFIER, null)
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava) result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION // DESCRIPTION

View File

@ -52,7 +52,8 @@ case class MAGPaper(
// List of authors // List of authors
authors: Option[List[MAGAuthor]], authors: Option[List[MAGAuthor]],
// List of Fields of Study // List of Fields of Study
fos: Option[List[MAGFieldOfStudy]] fos: Option[List[MAGFieldOfStudy]],
urls: Option[List[String]]
) )
case class MAGAuthor( case class MAGAuthor(

View File

@ -146,7 +146,7 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String],
$"Publisher".as("journalPublisher"), $"Publisher".as("journalPublisher"),
$"Webpage".as("journalWebpage") $"Webpage".as("journalWebpage")
) )
step3 val step4 = step3
.join(journals, step3("JournalId") === journals("JournalId"), "left") .join(journals, step3("JournalId") === journals("JournalId"), "left")
.select( .select(
step3("*"), step3("*"),
@ -155,6 +155,20 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String],
journals("journalPublisher"), journals("journalPublisher"),
journals("journalWebpage") journals("journalWebpage")
) )
.cache
step4.count()
val paper_urls = MagUtility
.loadMagEntity(spark, "PaperUrls", magBasePath)
.groupBy("PaperId")
.agg(slice(collect_set("SourceUrl"), 1, 6).alias("urls"))
.cache
paper_urls.count
step4
.join(paper_urls, step4("PaperId") === paper_urls("PaperId"))
.select(step4("*"), paper_urls("urls"))
.select( .select(
$"PaperId".as("paperId"), $"PaperId".as("paperId"),
$"Rank".as("rank"), $"Rank".as("rank"),
@ -192,7 +206,8 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String],
$"journalName".as("journalName"), $"journalName".as("journalName"),
$"journalIssn".as("journalIssn"), $"journalIssn".as("journalIssn"),
$"journalPublisher".as("journalPublisher"), $"journalPublisher".as("journalPublisher"),
$"journalWebpage".as("journalWebpage") $"journalWebpage".as("journalWebpage"),
$"urls"
) )
.write .write
.mode("OverWrite") .mode("OverWrite")