DHP-Explorer/src/main/java/eu/dnetlib/scholix/GenerateEventDataRelations....

package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.doiboost.crossref.CrossrefDT
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

class GenerateEventDataRelations(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {
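
  /** Reads the raw event-data dump (a column named "json", one event per row),
    * converts each event into an OAF Relation via DHPUtils.eventDataToRelation
    * and writes the result to relation_path.
    */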
  private def convertEventDataToRelations(spark: SparkSession, sourcePath: String, relation_path: String): Unit = {
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._
    spark.read
      .load(sourcePath)
      .select("json")
      .map(r => r.getString(0))
      .map(r => DHPUtils.eventDataToRelation(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(relation_path)
  }
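
  /** Resolves the "unresolved" source and target identifiers of the generated relations.
    * An identifier map extracted from the graph entities is joined first on the relation
    * source and then on the target; relations that remain unresolved are discarded and
    * the inverse relationships are generated for the resolved ones.
    */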
  private def resolveRelations(spark: SparkSession, workingPath: String): Unit = {
    val entityPath = "/tmp/beta_provision/scholix/entities/*"
    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val df = spark.read.load(entityPath).as[Result]
    df.filter(r => r != null && r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
      .flatMap(r => DHPUtils.extractPidMap(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/pidMap")

    val pidMap: Dataset[(String, String)] = spark.read.load(s"$workingPath/pidMap").as[(String, String)]

    val unresolvedSourceRelation: Dataset[(String, Relation)] = spark.read
      .load(s"$workingPath/relations")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    // Resolve the source: replace the unresolved pid with the identifier found in the pid map.
    unresolvedSourceRelation
      .joinWith(pidMap, unresolvedSourceRelation("_1").equalTo(pidMap("_2")), "leftouter")
      .map(k => {
        if (k._2 == null)
          null
        else {
          val rel = k._1._2
          val pid = k._2._1
          rel.setSource(pid)
          rel
        }
      })
      .as[Relation]
      .filter(r => r != null && !r.getSource.startsWith("unresolved"))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/relSourceResolved")

    // Resolve the target in the same way, then generate the inverse relationships.
    val sourceResolved: Dataset[(String, Relation)] = spark.read
      .load(s"$workingPath/relSourceResolved")
      .as[Relation]
      .map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoders))
    sourceResolved
      .joinWith(pidMap, sourceResolved("_1").equalTo(pidMap("_2")), "leftouter")
      .map(k => {
        if (k._2 == null)
          null
        else {
          val rel = k._1._2
          val pid = k._2._1
          rel.setTarget(pid)
          rel
        }
      })
      .as[Relation]
      .filter(r => r != null && !r.getTarget.startsWith("unresolved"))
      .flatMap(r => DHPUtils.createInverseRelationships(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/relResolved")

    val totRels = unresolvedSourceRelation.count()
    val resolved = spark.read.load(s"$workingPath/relResolved").as[Relation]
    val totResolved = resolved.count()
    println(s"RESOLVED $totResolved/$totRels")
  }
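
  /** Serializes the aggregated Scholix records as JSON text files under scholix_json. */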
  private def serializeScholix(spark: SparkSession, workingPath: String): Unit = {
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    import spark.implicits._
    val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
    val mapper = new ObjectMapper()
    scholix
      .map(s => mapper.writeValueAsString(s))
      .write
      .mode(SaveMode.Overwrite)
      .text(s"$workingPath/scholix_json")
  }
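
  /** Prints sanity-check counts: total Scholix records, the distribution of relationship
    * names, total raw relations and the number of distinct relation identifiers.
    */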
  private def checkRelations(spark: SparkSession, workingPath: String, rel_path: String): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._
    val rels = spark.read.load(rel_path)
    val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
    println(scholix.count())
    scholix.map(s => s.getRelationship.getName).groupBy("value").agg(count("value").alias("Total")).show()
    println(rels.count())
    println(rels.select("json").map(r => DHPUtils.extractIdRel(r.getString(0))).distinct().count())
  }
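
  /** Builds the Scholix records: relations are joined with the summaries on the source side,
    * the partial Scholix is then joined with the summaries on the target side, the inverse
    * relation is generated and duplicates are merged by identifier with scholixAggregator.
    */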
  private def createScholix(spark: SparkSession, workingPath: String, summaryPath: String): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val summaryDS = spark.read
      .load(summaryPath)
      .as[ScholixSummary]
      .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder))
    val relationDS = spark.read
      .load(s"$workingPath/relResolved")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    // Join each relation with the summary of its source and create the source side of the Scholix.
    relationDS
      .joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
      .map { input: ((String, Relation), (String, ScholixSummary)) =>
        if (input._1 != null && input._2 != null) {
          val rel: Relation = input._1._2
          val source: ScholixSummary = input._2._2
          (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
        } else null
      }(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .filter(r => r != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_from_source")

    // Join the partial Scholix with the summary of its target to obtain the complete record.
    val scholixSource: Dataset[(String, Scholix)] = spark.read
      .load(s"$workingPath/scholix_from_source")
      .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
    scholixSource
      .joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
      .map { input: ((String, Scholix), (String, ScholixSummary)) =>
        if (input._2 == null) {
          null
        } else {
          val s: Scholix = input._1._2
          val target: ScholixSummary = input._2._2
          ScholixUtils.generateCompleteScholix(s, target)
        }
      }
      .filter(s => s != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_one_verse")

    // Add the inverse relation for each Scholix and merge duplicates by identifier.
    val scholix_o_v: Dataset[Scholix] =
      spark.read.load(s"$workingPath/scholix_one_verse").as[Scholix]
    scholix_o_v
      .flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
      .as[Scholix]
      .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .groupByKey(_._1)
      .agg(ScholixUtils.scholixAggregator.toColumn)
      .map(s => s._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix")

    val scholix_final: Dataset[Scholix] = spark.read.load(s"$workingPath/scholix").as[Scholix]
    println(scholix_final.count())
  }
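
  /** Debug check: counts how many rows of the final blacklisted graph contain a given DOI. */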
  private def checkCrossrefDOI(spark: SparkSession): Unit = {
    implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
    import spark.implicits._
    // val df = spark.read.load("/data/doiboost/input/crossref/crossref_ds").as[CrossrefDT]
    // val tot = df.filter(d => d.doi.equalsIgnoreCase("10.1107/s2052252521010563/yc50352sup3.hkl")).count()
    // println(s"Found $tot")
    val tot2 = spark.read
      .text("/tmp/beta_provision/graph/19_graph_blacklisted/*")
      .as[String]
      .filter(s => s.contains("10.1107/s2052252521010563/yc50352sup3.hkl"))
      .count()
    println(s"Found in the final graph $tot2")
  }

  /** Entry point run by every Spark application: the whole logic of this Spark node is
    * defined here. Only one pipeline step is enabled at a time; the others are kept
    * commented out for reference.
    */
  override def run(): Unit = {
    val source_path: String = argumentMap.get("source_path").orNull
    val working_path: String = argumentMap.get("working_path").orNull
    if (source_path == null) throw new MissingArgumentException("Missing argument source_path")
    if (working_path == null) throw new MissingArgumentException("Missing argument working_path")
    // convertEventDataToRelations(spark, source_path, s"$working_path/relations")
    // resolveRelations(spark, working_path)
    // checkRelations(spark, working_path, source_path)
    // createScholix(spark, working_path, "/tmp/beta_provision/scholix/provision/summaries")
    // serializeScholix(spark, working_path)
    checkCrossrefDOI(spark)
    spark.close()
  }
}

object GenerateEventDataRelations {

  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  def main(args: Array[String]): Unit = {
    new GenerateEventDataRelations(args, log).initialize().run()
  }
}