Draft SparkPropagateOrcidAuthors
This commit is contained in:
parent
d67f125614
commit
aeaedeed01
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.dhp.enrich.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
|
||||
import eu.dnetlib.dhp.utils.OrcidAuthor
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
class SparkPropagateOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends SparkEnrichGraphWithOrcidAuthors(propertyPath, args, log: Logger) {
|
||||
|
||||
override def createTemporaryData(graphPath: String, orcidPath: String, targetPath: String): Unit = {
|
||||
val relEnc = Encoders.bean(classOf[Relation])
|
||||
|
||||
ModelSupport.entityTypes.asScala
|
||||
.filter(e => ModelSupport.isResult(e._1))
|
||||
.foreach(e => {
|
||||
val resultType = e._1.name()
|
||||
val enc = Encoders.bean(e._2)
|
||||
|
||||
val orcidDnet = spark.read
|
||||
.load("$graphPath/$resultType")
|
||||
.as[Result]
|
||||
.map(
|
||||
result =>
|
||||
(
|
||||
result.getId,
|
||||
result.getAuthor.asScala.map(a => OrcidAuthor("extract ORCID", a.getSurname, a.getName, a.getFullname, null))
|
||||
)
|
||||
)
|
||||
.where("size(_2) > 0")
|
||||
.selectExpr("_1 as id", "_2 as orcid_authors")
|
||||
|
||||
val result =
|
||||
spark.read.schema(enc.schema).json(s"$graphPath/$resultType").selectExpr("id", "author as graph_authors")
|
||||
|
||||
val supplements = spark.read.schema(relEnc.schema).json(s"$graphPath/relation").where("relclass IN('isSupplementedBy', 'isSupplementOf')").selectExpr("source as id", "target")
|
||||
|
||||
result
|
||||
.join(supplements, Seq("id"))
|
||||
.join(orcidDnet, orcidDnet("id") === col("target"))
|
||||
.drop("target")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.parquet(s"$targetPath/${resultType}_unmatched")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
object SparkPropagateOrcidAuthors {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(SparkPropagateOrcidAuthors.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new SparkPropagateOrcidAuthors("/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json", args, log)
|
||||
.initialize()
|
||||
.run()
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue