84 lines
3.1 KiB
Scala
84 lines
3.1 KiB
Scala
package eu.dnetlib.dhp.collection.mag
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper
|
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
|
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
|
|
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
|
import eu.dnetlib.dhp.schema.oaf.Relation
|
|
import org.apache.spark.sql.functions.col
|
|
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}
|
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
|
import org.slf4j.{Logger, LoggerFactory}
|
|
|
|
/** Spark application that converts the denormalized MAG dump into OAF records
  * stored in an mdstore, and then appends the affiliation relations for the
  * papers that were generated.
  *
  * @param propertyPath classpath location of the argument definition file
  * @param args         command-line arguments forwarded to the argument parser
  * @param log          logger used by the application
  */
class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Entry point of the Spark node.
    *
    * Reads the `mdstoreOutputVersion` and `magBasePath` arguments, derives the
    * mdstore data path from the HDFS path of the parsed [[MDStoreVersion]],
    * then runs the conversion, the affiliation generation and finally
    * `reportTotalSize`.
    */
  override def run(): Unit = {
    val versionJson = parser.get("mdstoreOutputVersion")
    log.info(s"mdstoreOutputVersion is '$versionJson'")

    // The output version is passed as a JSON-serialized MDStoreVersion.
    val jsonMapper = new ObjectMapper()
    val outputVersion = jsonMapper.readValue(versionJson, classOf[MDStoreVersion])
    val outputBasePath = outputVersion.getHdfsPath
    log.info(s"outputBasePath is '$outputBasePath'")

    val mdstorePath = s"$outputBasePath$MDSTORE_DATA_PATH"

    val magBasePath: String = parser.get("magBasePath")
    log.info("found parameters magBasePath: {}", magBasePath)

    convertMAG(spark, magBasePath, mdstorePath)
    generateAffiliations(spark, magBasePath, mdstorePath)
    reportTotalSize(mdstorePath, outputBasePath)
  }

  /** Loads `$magBasePath/mag_denormalized` as a dataset of `MAGPaper`, maps
    * every paper through `MagUtility.convertMAGtoOAF`, discards null results
    * and writes the remainder as gzip-compressed text to `mdStorePath`,
    * overwriting whatever is already there.
    *
    * @param spark       active Spark session
    * @param magBasePath base path of the MAG tables
    * @param mdStorePath destination path of the mdstore data
    */
  def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
    import spark.implicits._

    val denormalizedPapers = spark.read
      .load(s"$magBasePath/mag_denormalized")
      .as[MAGPaper]

    // Null conversion results are dropped before writing.
    val oafRecords = denormalizedPapers
      .map(paper => MagUtility.convertMAGtoOAF(paper))
      .filter(record => record != null)

    oafRecords.write
      .option("compression", "gzip")
      .mode(SaveMode.Overwrite)
      .text(mdStorePath)
  }

  /** Appends to `mdStorePath` the relations produced by
    * `MagUtility.generateAffiliationRelations` for the paper/affiliation pairs
    * whose `PaperId` occurs among the `originalId` values of the records
    * already stored in `mdStorePath`.
    *
    * @param spark       active Spark session
    * @param magBasePath base path of the MAG tables
    * @param mdStorePath mdstore path that is read back and then appended to
    */
  def generateAffiliations(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {

    implicit val relationEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])

    // Only these two fields are read back from the stored records.
    val recordSchema = StructType(
      Seq(
        StructField("id", StringType),
        StructField("originalId", ArrayType(StringType))
      )
    )

    val storedRecords = spark.read.schema(recordSchema).json(mdStorePath)
    val generatedMag = storedRecords.selectExpr("explode(originalId) as PaperId").distinct()

    val affiliationTable = MagUtility.loadMagEntity(spark, "PaperAuthorAffiliations", magBasePath)
    val paperAuthorAffiliations = affiliationTable
      .where(col("AffiliationId").isNotNull)
      .select("PaperId", "AffiliationId")
      .distinct()

    // Left-semi join: keeps only the affiliation rows whose paper was generated.
    val samePaper = paperAuthorAffiliations("PaperId") === generatedMag("PaperId")
    val affiliationsOfGeneratedPapers = paperAuthorAffiliations.join(generatedMag, samePaper, "leftsemi")

    affiliationsOfGeneratedPapers
      .flatMap(row => MagUtility.generateAffiliationRelations(row))
      .write
      .option("compression", "gzip")
      .mode(SaveMode.Append)
      .json(mdStorePath)
  }
}
|
|
|
|
/** Companion object holding the logger and the command-line entry point. */
object SparkMAGtoOAF {

  val log: Logger = LoggerFactory.getLogger(SparkMAGtoOAF.getClass)

  /** Builds the application with its argument definition file, initializes it
    * and runs it.
    *
    * @param args command-line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    val propertiesResource = "/eu/dnetlib/dhp/collection/mag/convert_MAG_to_OAF_properties.json"
    val application = new SparkMAGtoOAF(propertiesResource, args, log)
    application.initialize().run()
  }
}
|