63 lines
2.6 KiB
Scala
63 lines
2.6 KiB
Scala
package eu.dnetlib.dhp.provision
|
|
|
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
|
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixResource}
|
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
|
import eu.dnetlib.dhp.schema.oaf.Relation
|
|
import org.apache.commons.io.IOUtils
|
|
import org.apache.spark.SparkConf
|
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
|
|
|
/** Spark job that assembles Scholix records in two join passes.
  *
  * Pass 1 joins the summaries stored under `workingDirPath/summary` with the
  * relations stored under `graphPath/relation` on the relation source id and
  * writes the partially built Scholix records, keyed by the relation target id,
  * to `workingDirPath/scholix_source`.
  *
  * Pass 2 reads that intermediate dataset back, joins it with the summaries on
  * the target id, fills in the target side of every Scholix record and writes
  * the result to `workingDirPath/scholix`.
  *
  * Both outputs are written with `SaveMode.Overwrite`.
  */
object SparkGenerateScholixIndex {

  /** Classpath location of the JSON descriptor handed to the argument parser. */
  private val ParametersResource: String =
    "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"

  /** Reads the argument descriptor from the classpath as UTF-8 text.
    *
    * The stream is always closed, and a missing resource is reported with an
    * explicit message instead of surfacing as a NullPointerException.
    *
    * @return the content of the descriptor resource
    * @throws IllegalArgumentException if the resource is not on the classpath
    */
  private def loadParametersDefinition(): String = {
    val stream = SparkGenerateScholixIndex.getClass.getResourceAsStream(ParametersResource)
    require(stream != null, s"Missing classpath resource: $ParametersResource")
    // Explicit charset: the single-argument IOUtils.toString overload is deprecated
    // because it decodes with the platform default encoding.
    try IOUtils.toString(stream, "UTF-8")
    finally stream.close()
  }

  /** Entry point of the job.
    *
    * @param args command line arguments; the values read from them are
    *             `master`, `graphPath` and `workingDirPath`
    */
  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(loadParametersDefinition())
    parser.parseArgument(args)

    val conf = new SparkConf
    conf.set("spark.sql.shuffle.partitions", "4000")

    val spark = SparkSession.builder
      .config(conf)
      .appName(SparkGenerateScholixIndex.getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate

    val graphPath = parser.get("graphPath")
    val workingDirPath = parser.get("workingDirPath")

    // Kryo encoders for the Java bean types handled by the job.
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val tupleScholix: Encoder[(String, Scholix)] =
      Encoders.tuple(Encoders.STRING, scholixEncoder)

    // Summaries keyed by their own id.
    val scholixSummary: Dataset[(String, ScholixSummary)] = spark.read
      .load(s"$workingDirPath/summary")
      .as[ScholixSummary]
      .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder))

    // Relations keyed by their source id.
    val sourceRelations: Dataset[(String, Relation)] = spark.read
      .load(s"$graphPath/relation")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))

    // Pass 1: attach the source summary to every relation and re-key the
    // resulting Scholix by the relation target, ready for the second join.
    scholixSummary
      .joinWith(sourceRelations, scholixSummary("_1").equalTo(sourceRelations("_1")), "inner")
      .map(r => {
        val summary = r._1._2
        val relation = r._2._2
        (relation.getTarget, Scholix.generateScholixWithSource(summary, relation))
      })
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/scholix_source")

    val sTarget: Dataset[(String, Scholix)] = spark.read
      .load(s"$workingDirPath/scholix_source")
      .as[(String, Scholix)]

    // Pass 2: resolve the target summary and complete each Scholix record.
    sTarget
      .joinWith(scholixSummary, sTarget("_1").equalTo(scholixSummary("_1")), "inner")
      .map(i => {
        val summary = i._2._2
        val scholix = i._1._2
        val scholixResource = ScholixResource.fromSummary(summary)
        scholix.setTarget(scholixResource)
        scholix.generateIdentifier()
        scholix.generatelinkPublisher()
        scholix
      })
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingDirPath/scholix")
  }

}
|