237 lines
8.6 KiB
Scala
237 lines
8.6 KiB
Scala
package eu.dnetlib.scholix
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper
|
|
import com.sandro.app.AbstractScalaApplication
|
|
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
|
|
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
|
|
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
|
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
|
import eu.dnetlib.doiboost.crossref.CrossrefDT
|
|
import org.apache.commons.cli.MissingArgumentException
|
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
|
import org.slf4j.{Logger, LoggerFactory}
|
|
import org.apache.spark.sql.functions.count
|
|
|
|
/** Spark application that converts event-data records into relations, resolves the relation
  * endpoints against a pid map and assembles Scholix records from the resolved relations.
  *
  * Each pipeline step is a private method of this class; `run()` decides which of them are
  * actually executed.
  *
  * @param args command line arguments, forwarded to `AbstractScalaApplication`
  * @param log  logger, forwarded to `AbstractScalaApplication`
  */
class GenerateEventDataRelations(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {

  /** Converts the `json` column of the dataset stored at `sourcePath` into relations.
    *
    * Every value is passed through `DHPUtils.eventDataToRelation` and the result is written to
    * `relation_path`, overwriting whatever is already stored there.
    *
    * @param spark         the active Spark session
    * @param sourcePath    location of the input dataset; it must expose a `json` string column
    * @param relation_path output location of the generated relations
    */
  private def convertEventDataToRelations(spark: SparkSession, sourcePath: String, relation_path: String): Unit = {
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    spark.read
      .load(sourcePath)
      .select("json")
      .map(row => row.getString(0))
      .map(json => DHPUtils.eventDataToRelation(json))
      .write
      .mode(SaveMode.Overwrite)
      .save(relation_path)
  }

  /** Replaces the source and target of the relations stored under `$workingPath/relations`
    * with the identifiers found in a pid map built from the entity dump.
    *
    * Steps, each materialised under `workingPath` with `SaveMode.Overwrite`:
    *  1. `pidMap`: the output of `DHPUtils.extractPidMap` for every entity whose data info is
    *     present and not flagged as deleted by inference;
    *  1. `relSourceResolved`: relations whose source matched the second column of the pid map,
    *     with the source replaced by the first column;
    *  1. `relResolved`: the same resolution applied to the target, followed by
    *     `DHPUtils.createInverseRelationships`.
    *
    * Relations without a match in the pid map, or whose resolved endpoint still starts with
    * `unresolved`, are dropped. The number of resolved relations is finally printed next to
    * the number of input relations.
    *
    * @param spark       the active Spark session
    * @param workingPath base directory holding the `relations` input and all the outputs
    */
  private def resolveRelations(spark: SparkSession, workingPath: String): Unit = {
    val entityPath = "/tmp/beta_provision/scholix/entities/*"

    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    spark.read
      .load(entityPath)
      .as[Result]
      .filter(r => r != null && r.getDataInfo != null && false == r.getDataInfo.getDeletedbyinference)
      .flatMap(r => DHPUtils.extractPidMap(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/pidMap")

    val pidMap: Dataset[(String, String)] = spark.read.load(s"$workingPath/pidMap").as[(String, String)]

    val unresolvedSourceRelation: Dataset[(String, Relation)] = spark.read
      .load(s"$workingPath/relations")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    unresolvedSourceRelation
      .joinWith(pidMap, unresolvedSourceRelation("_1").equalTo(pidMap("_2")), "leftouter")
      // The outer join yields a null right side for relations without a pid match: drop them
      // here, so that no null relation ever reaches the filter below (it dereferences the
      // relation and would fail with a NullPointerException).
      .filter(pair => pair._2 != null)
      .map { pair =>
        val rel = pair._1._2
        rel.setSource(pair._2._1)
        rel
      }
      .filter(r => !r.getSource.startsWith("unresolved"))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/relSourceResolved")

    val rsolved: Dataset[(String, Relation)] = spark.read
      .load(s"$workingPath/relSourceResolved")
      .as[Relation]
      .map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    rsolved
      .joinWith(pidMap, rsolved("_1").equalTo(pidMap("_2")), "leftouter")
      // Same null guard as for the source side.
      .filter(pair => pair._2 != null)
      .map { pair =>
        val rel = pair._1._2
        rel.setTarget(pair._2._1)
        rel
      }
      .filter(r => !r.getTarget.startsWith("unresolved"))
      .flatMap(r => DHPUtils.createInverseRelationships(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/relResolved")

    val totRels = unresolvedSourceRelation.count()
    val resolved = spark.read.load(s"$workingPath/relResolved").as[Relation]
    val totResolved = resolved.count()
    println(s"RESOLVED $totResolved/$totRels")
  }

  /** Writes every Scholix record stored under `$workingPath/scholix` as one JSON document per
    * line into `$workingPath/scholix_json`, overwriting previous content.
    *
    * @param spark       the active Spark session
    * @param workingPath base directory holding the `scholix` input and the `scholix_json` output
    */
  private def serializeScholix(spark: SparkSession, workingPath: String): Unit = {
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    import spark.implicits._

    val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
    val mapper = new ObjectMapper()

    scholix
      .map(s => mapper.writeValueAsString(s))
      .write
      .mode(SaveMode.Overwrite)
      .text(s"$workingPath/scholix_json")
  }

  /** Prints a few diagnostic figures about the generated data: the number of Scholix records,
    * their count per relationship name, the number of rows of the dataset at `rel_path` and
    * the number of distinct values `DHPUtils.extractIdRel` yields for its `json` column.
    *
    * @param spark       the active Spark session
    * @param workingPath base directory holding the `scholix` dataset
    * @param rel_path    location of the dataset exposing the `json` column to be inspected
    */
  private def checkRelations(spark: SparkSession, workingPath: String, rel_path: String): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val rels = spark.read.load(rel_path)
    val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]

    println(scholix.count())

    scholix
      .map(s => s.getRelationship.getName)
      .groupBy("value")
      .agg(count("value").alias("Total"))
      .show()

    println(rels.count())

    println(rels.select("json").map(r => DHPUtils.extractIdRel(r.getString(0))).distinct().count())
  }

  /** Builds the Scholix records out of the resolved relations and the Scholix summaries.
    *
    * Steps, each materialised under `workingPath` with `SaveMode.Overwrite`:
    *  1. `scholix_from_source`: relations joined with the summary of their source and turned
    *     into partial records by `ScholixUtils.scholixFromSource`, keyed by relation target;
    *  1. `scholix_one_verse`: partial records joined with the summary of their target and
    *     completed by `ScholixUtils.generateCompleteScholix`;
    *  1. `scholix`: every record together with the output of
    *     `ScholixUtils.createInverseScholixRelation`, grouped by identifier and reduced with
    *     `ScholixUtils.scholixAggregator`.
    *
    * Rows without a matching summary are discarded. The number of final records is printed.
    *
    * @param spark       the active Spark session
    * @param workingPath base directory holding the `relResolved` input and all the outputs
    * @param summaryPath location of the `ScholixSummary` dataset
    */
  private def createScholix(spark: SparkSession, workingPath: String, summaryPath: String): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    // The summaries are read from the location received as argument, no longer from a
    // hard-coded path that silently ignored it.
    val summaryDS = spark.read
      .load(summaryPath)
      .as[ScholixSummary]
      .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder))

    val relationDS = spark.read
      .load(s"$workingPath/relResolved")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    relationDS
      .joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
      // Unmatched rows are removed before the map: a null tuple cannot be handed to the tuple
      // encoder, hence the null check has to precede the transformation.
      .filter(pair => pair._1 != null && pair._2 != null)
      .map { input: ((String, Relation), (String, ScholixSummary)) =>
        val rel: Relation = input._1._2
        val source: ScholixSummary = input._2._2
        (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
      }(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_from_source")

    val scholixSource: Dataset[(String, Scholix)] = spark.read
      .load(s"$workingPath/scholix_from_source")
      .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))

    scholixSource
      .joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
      .filter(pair => pair._2 != null)
      .map { input: ((String, Scholix), (String, ScholixSummary)) =>
        val s: Scholix = input._1._2
        val target: ScholixSummary = input._2._2
        ScholixUtils.generateCompleteScholix(s, target)
      }
      // Kept in case the helper itself yields null for some record.
      .filter(s => s != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_one_verse")

    val scholix_o_v: Dataset[Scholix] =
      spark.read.load(s"$workingPath/scholix_one_verse").as[Scholix]

    scholix_o_v
      .flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
      .as[Scholix]
      .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .groupByKey(_._1)
      .agg(ScholixUtils.scholixAggregator.toColumn)
      .map(s => s._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix")

    val scholix_final: Dataset[Scholix] = spark.read.load(s"$workingPath/scholix").as[Scholix]

    println(scholix_final.count())
  }

  /** Counts the lines of the text files under `/tmp/beta_provision/graph/19_graph_blacklisted/`
    * that contain the DOI `10.1107/s2052252521010563/yc50352sup3.hkl` and prints the total.
    *
    * @param spark the active Spark session
    */
  private def checkCrossrefDOI(spark: SparkSession): Unit = {
    implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
    import spark.implicits._

    // val df = spark.read.load("/data/doiboost/input/crossref/crossref_ds").as[CrossrefDT]
    // val tot = df.filter(d=> d.doi.equalsIgnoreCase("10.1107/s2052252521010563/yc50352sup3.hkl")).count()
    // println(s"Found $tot")

    val tot2 = spark.read
      .text("/tmp/beta_provision/graph/19_graph_blacklisted/*")
      .as[String]
      .filter(s => s.contains("10.1107/s2052252521010563/yc50352sup3.hkl"))
      .count()
    println(s"Found in the final graph $tot2")
  }

  /** Returns the value of a mandatory command line argument.
    *
    * @param name key of the argument inside `argumentMap`
    * @throws MissingArgumentException when the argument is absent or null
    */
  private def requiredArgument(name: String): String =
    Option(argumentMap.get(name).orNull)
      .getOrElse(throw new MissingArgumentException(s"Missing argument $name"))

  /** Entry point of the Spark job: validates the mandatory arguments, runs the enabled
    * pipeline steps and closes the Spark session.
    *
    * The session is closed even when a step fails, so that a failing job does not leave it
    * open.
    *
    * @throws MissingArgumentException when `source_path` or `working_path` is not provided
    */
  override def run(): Unit = {
    val source_path: String = requiredArgument("source_path")
    val working_path: String = requiredArgument("working_path")

    try {
      // convertEventDataToRelations(spark, source_path, s"$working_path/relations")
      // resolveRelations(spark, working_path )
      // checkRelations(spark, working_path, source_path)
      // createScholix(spark, working_path, "/tmp/beta_provision/scholix/provision/summaries")
      // serializeScholix(spark, working_path)

      checkCrossrefDOI(spark)
    } finally {
      spark.close()
    }
  }
}
|
|
|
|
|
|
/** Companion object hosting the command line entry point of the application. */
object GenerateEventDataRelations {

  /** Logger handed over to the application instance; it is named after this object's class. */
  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  /** Builds the application from the command line arguments, initializes it and runs it.
    *
    * @param args command line arguments forwarded untouched to the application
    */
  def main(args: Array[String]): Unit = {
    val application = new GenerateEventDataRelations(args, log)
    application.initialize().run()
  }
}
|