dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala

package eu.dnetlib.dhp.sx.graph

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
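
/**
 * SparkCreateScholix builds the Scholix dataset: graph relations are joined with
 * the summaries of their source and target entities, then every link is
 * materialized in both directions and deduplicated by identifier.
 */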
object SparkCreateScholix {
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val relationPath = parser.get("relationPath")
    log.info(s"relationPath -> $relationPath")
    val summaryPath = parser.get("summaryPath")
    log.info(s"summaryPath -> $summaryPath")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

    import spark.implicits._
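
    // relations keyed by their source identifier, to be joined with the summaries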
    val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
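
    // summaries keyed by their identifier (the join key on both sides)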
    val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
      .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, summaryEncoder))
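
    // join each relation with the summary of its source entity and emit a partial
    // Scholix keyed by the relation target; relations without a matching summary
    // are dropped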
    relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
      .map { input: ((String, Relation), (String, ScholixSummary)) =>
        // the left join leaves the summary side null when no summary matches
        if (input._2 == null)
          null
        else {
          val rel: Relation = input._1._2
          val source: ScholixSummary = input._2._2
          (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
        }
      }(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .filter(r => r != null)
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
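
    // reload the partial Scholix records, still keyed by the target identifier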
    val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source")
      .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
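
    // complete each Scholix with the summary of its target entity; again a left
    // join, so records without a matching target summary are filtered out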
    scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
      .map { input: ((String, Scholix), (String, ScholixSummary)) =>
        if (input._2 == null)
          null
        else {
          val s: Scholix = input._1._2
          val target: ScholixSummary = input._2._2
          ScholixUtils.generateCompleteScholix(s, target)
        }
      }
      .filter(s => s != null)
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
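
    // the "one verse" dataset contains each link in a single direction only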
    val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
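
    // add the inverse direction for every link, then deduplicate by identifier,
    // keeping a single record per key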
    scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
      .groupByKey(_.getIdentifier)
      .reduceGroups { (x, y) => if (x != null) x else y }
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
  }
}