package eu.dnetlib.doiboost.mag

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._

object SparkProcessMAG {
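
  /** Filters out papers without a DOI and, for papers sharing the same normalized DOI,
    * keeps only the most recent record (ConversionUtil.choiceLatestMagArtitcle).
    */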
  def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
    d.where(col("Doi").isNotNull)
      .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
      .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
      .map(_._2)(Encoders.product[MagPapers])
      .map(mp => {
        MagPapers(
          mp.PaperId,
          mp.Rank,
          DoiBoostMappingUtil.normalizeDoi(mp.Doi),
          mp.DocType,
          mp.PaperTitle,
          mp.OriginalTitle,
          mp.BookTitle,
          mp.Year,
          mp.Date,
          mp.Publisher,
          mp.JournalId,
          mp.ConferenceSeriesId,
          mp.ConferenceInstanceId,
          mp.Volume,
          mp.Issue,
          mp.FirstPage,
          mp.LastPage,
          mp.ReferenceCount,
          mp.CitationCount,
          mp.EstimatedCitation,
          mp.OriginalVenue,
          mp.FamilyId,
          mp.CreatedDate
        )
      })(Encoders.product[MagPapers])
  }

  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")
      )
    )
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    val workingPath = parser.get("workingPath")
    val targetPath = parser.get("targetPath")

    import spark.implicits._
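    // Publication is a Java bean (not a Scala Product), so Kryo-based encoders are registered explicitly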
    implicit val mapEncoderPubs: Encoder[Publication] =
      org.apache.spark.sql.Encoders.kryo[Publication]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] =
      Encoders.tuple(Encoders.STRING, mapEncoderPubs)

logger.info("Phase 1) make uninue DOI in Papers:")
|
2020-12-07 19:59:33 +01:00
|
|
|
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
2020-05-20 17:05:46 +02:00
|
|
|
|
|
|
|
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
|
2021-06-29 18:51:11 +02:00
|
|
|
val distinctPaper: Dataset[MagPapers] = getDistinctResults(d)
|
2020-12-07 19:59:33 +01:00
|
|
|
|
|
|
|
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")

    logger.info("Phase 0) Enrich Publication with description")
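    // The source abstracts are stored as inverted indexes; transformPaperAbstract rebuilds them as plain text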
    val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
    pa.map(ConversionUtil.transformPaperAbstract)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/PaperAbstract")

    logger.info("Phase 3) Group Author by PaperId")
    val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]

    val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
    val paperAuthorAffiliation =
      spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]

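    // Denormalize: attach the author record and, when available, the affiliation name to each
    // (PaperId, AuthorId) pair, then collect the full author list per paper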
    paperAuthorAffiliation
      .joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
      .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) =>
        (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber))
      }
      .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
      .map(s => {
        val mpa = s._1._2
        val af = s._2
        if (af != null) {
          MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber)
        } else
          mpa
      })
      .groupBy("PaperId")
      .agg(collect_list(struct($"author", $"affiliation", $"sequenceNumber")).as("authors"))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_1_paper_authors")

    logger.info(
      "Phase 4) create First Version of publication Entity with Paper Journal and Authors"
    )

    val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]

    val papers = spark.read.load(s"$workingPath/Papers_distinct").as[MagPapers]

    val paperWithAuthors =
      spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]

    val firstJoin =
      papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
    firstJoin
      .joinWith(
        paperWithAuthors,
        firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")),
        "left"
      )
      .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_2")
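
    // Re-read the publications keyed by their MAG identifier and enrich them with conference
    // instance details (name, location, start and end date)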
    var magPubs: Dataset[(String, Publication)] =
      spark.read
        .load(s"$workingPath/merge_step_2")
        .as[Publication]
        .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
        .as[(String, Publication)]

    val conference = spark.read
      .load(s"$sourcePath/ConferenceInstances")
      .select(
        $"ConferenceInstanceId".as("ci"),
        $"DisplayName",
        $"Location",
        $"StartDate",
        $"EndDate"
      )

    val conferenceInstance = conference
      .joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
      .select(
        $"_1.ci",
        $"_1.DisplayName",
        $"_1.Location",
        $"_1.StartDate",
        $"_1.EndDate",
        $"_2.PaperId"
      )
      .as[MagConferenceInstance]

    magPubs
      .joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_3")
    val paperAbstract = spark.read.load(s"$workingPath/PaperAbstract").as[MagPaperAbstract]

    magPubs = spark.read
      .load(s"$workingPath/merge_step_3")
      .as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
      .as[(String, Publication)]

    magPubs
      .joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithDescription(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_4")

    logger.info("Phase 7) Enrich Publication with FieldOfStudy")

    magPubs = spark.read
      .load(s"$workingPath/merge_step_4")
      .as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))
      .as[(String, Publication)]

    val fos = spark.read
      .load(s"$sourcePath/FieldsOfStudy")
      .select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")

    val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
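
    // Collect, for each paper, the list of its fields of study together with their score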
    val paperField = pfos
      .joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
      .select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
      .groupBy($"PaperId")
      .agg(
        collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score"))
          .as("subjects")
      )
      .as[MagFieldOfStudy]

    magPubs
      .joinWith(paperField, col("_1").equalTo(paperField("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithSubject(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/mag_publication")
    spark.read
      .load(s"$workingPath/mag_publication")
      .as[Publication]
      .filter(p => p.getId != null)
      .groupByKey(p => p.getId)
      .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
      .map(_._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$targetPath/magPublication")

  }
}