forked from D-Net/dnet-hadoop
code refactor
This commit is contained in:
parent
c532831718
commit
ee1fcb672b
|
@ -249,7 +249,7 @@ case object Crossref2Oaf {
|
||||||
val subtitles =
|
val subtitles =
|
||||||
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty(
|
for { JString(title) <- json \ "subtitle" if title.nonEmpty } yield structuredProperty(
|
||||||
title,
|
title,
|
||||||
ModelConstants.SUBTITLE_QUALIFIER, null)
|
ModelConstants.SUBTITLE_QUALIFIER, null)
|
||||||
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
|
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
|
||||||
|
|
||||||
// DESCRIPTION
|
// DESCRIPTION
|
||||||
|
|
|
@ -52,7 +52,8 @@ case class MAGPaper(
|
||||||
// List of authors
|
// List of authors
|
||||||
authors: Option[List[MAGAuthor]],
|
authors: Option[List[MAGAuthor]],
|
||||||
// List of Fields of Study
|
// List of Fields of Study
|
||||||
fos: Option[List[MAGFieldOfStudy]]
|
fos: Option[List[MAGFieldOfStudy]],
|
||||||
|
urls: Option[List[String]]
|
||||||
)
|
)
|
||||||
|
|
||||||
case class MAGAuthor(
|
case class MAGAuthor(
|
||||||
|
|
|
@ -146,7 +146,7 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String],
|
||||||
$"Publisher".as("journalPublisher"),
|
$"Publisher".as("journalPublisher"),
|
||||||
$"Webpage".as("journalWebpage")
|
$"Webpage".as("journalWebpage")
|
||||||
)
|
)
|
||||||
step3
|
val step4 = step3
|
||||||
.join(journals, step3("JournalId") === journals("JournalId"), "left")
|
.join(journals, step3("JournalId") === journals("JournalId"), "left")
|
||||||
.select(
|
.select(
|
||||||
step3("*"),
|
step3("*"),
|
||||||
|
@ -155,6 +155,20 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String],
|
||||||
journals("journalPublisher"),
|
journals("journalPublisher"),
|
||||||
journals("journalWebpage")
|
journals("journalWebpage")
|
||||||
)
|
)
|
||||||
|
.cache
|
||||||
|
step4.count()
|
||||||
|
|
||||||
|
val paper_urls = MagUtility
|
||||||
|
.loadMagEntity(spark, "PaperUrls", magBasePath)
|
||||||
|
.groupBy("PaperId")
|
||||||
|
.agg(slice(collect_set("SourceUrl"), 1, 6).alias("urls"))
|
||||||
|
.cache
|
||||||
|
|
||||||
|
paper_urls.count
|
||||||
|
|
||||||
|
step4
|
||||||
|
.join(paper_urls, step4("PaperId") === paper_urls("PaperId"))
|
||||||
|
.select(step4("*"), paper_urls("urls"))
|
||||||
.select(
|
.select(
|
||||||
$"PaperId".as("paperId"),
|
$"PaperId".as("paperId"),
|
||||||
$"Rank".as("rank"),
|
$"Rank".as("rank"),
|
||||||
|
@ -192,7 +206,8 @@ class SparkCreateMagDenormalizedTable(propertyPath: String, args: Array[String],
|
||||||
$"journalName".as("journalName"),
|
$"journalName".as("journalName"),
|
||||||
$"journalIssn".as("journalIssn"),
|
$"journalIssn".as("journalIssn"),
|
||||||
$"journalPublisher".as("journalPublisher"),
|
$"journalPublisher".as("journalPublisher"),
|
||||||
$"journalWebpage".as("journalWebpage")
|
$"journalWebpage".as("journalWebpage"),
|
||||||
|
$"urls"
|
||||||
)
|
)
|
||||||
.write
|
.write
|
||||||
.mode("OverWrite")
|
.mode("OverWrite")
|
||||||
|
|
Loading…
Reference in New Issue