package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.doiboost.crossref.CrossrefDT
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

// NOTE: DHPUtils (eventDataToRelation, extractPidMap, createInverseRelationships, extractIdRel)
// is assumed to be resolvable from this package or through an additional import.
class GenerateEventDataRelations(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {

  /** Converts the event-data JSON records found at sourcePath into OAF Relations. */
  private def convertEventDataToRelations(spark: SparkSession, sourcePath: String, relation_path: String): Unit = {
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._
    spark.read
      .load(sourcePath)
      .select("json")
      .map(r => r.getString(0))
      .map(r => DHPUtils.eventDataToRelation(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(relation_path)
  }

  /** Resolves the unresolved source and target identifiers of the relations using the pid map
    * extracted from the graph entities, then materializes the inverse relationships as well.
    */
  private def resolveRelations(spark: SparkSession, workingPath: String): Unit = {
    val entityPath = "/tmp/beta_provision/scholix/entities/*"
    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val df = spark.read.load(entityPath).as[Result]

    df.filter(r => r != null && r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
      .flatMap(r => DHPUtils.extractPidMap(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/pidMap")

    val pidMap: Dataset[(String, String)] = spark.read.load(s"$workingPath/pidMap").as[(String, String)]

    val unresolvedSourceRelation: Dataset[(String, Relation)] = spark.read
      .load(s"$workingPath/relations")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    // Resolve the source side of each relation.
    unresolvedSourceRelation
      .joinWith(pidMap, unresolvedSourceRelation("_1").equalTo(pidMap("_2")), "leftouter")
      .map(k => {
        if (k._2 == null) null
        else {
          val rel = k._1._2
          val pid = k._2._1
          rel.setSource(pid)
          rel
        }
      })
      .as[Relation]
      .filter(r => r != null && !r.getSource.startsWith("unresolved"))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/relSourceResolved")

    // Resolve the target side and generate the inverse relationships.
    val sourceResolved: Dataset[(String, Relation)] = spark.read
      .load(s"$workingPath/relSourceResolved")
      .as[Relation]
      .map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    sourceResolved
      .joinWith(pidMap, sourceResolved("_1").equalTo(pidMap("_2")), "leftouter")
      .map(k => {
        if (k._2 == null) null
        else {
          val rel = k._1._2
          val pid = k._2._1
          rel.setTarget(pid)
          rel
        }
      })
      .as[Relation]
      .filter(r => r != null && !r.getTarget.startsWith("unresolved"))
      .flatMap(r => DHPUtils.createInverseRelationships(r))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/relResolved")

    val totRels = unresolvedSourceRelation.count()
    val resolved = spark.read.load(s"$workingPath/relResolved").as[Relation]
    val totResolved = resolved.count()
    println(s"RESOLVED $totResolved/$totRels")
  }

  /** Serializes the generated Scholix records as JSON text. */
  private def serializeScholix(spark: SparkSession, workingPath: String): Unit = {
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
    val mapper = new ObjectMapper()
    import spark.implicits._
    scholix
      .map(s => mapper.writeValueAsString(s))
      .write
      .mode(SaveMode.Overwrite)
      .text(s"$workingPath/scholix_json")
  }

  /** Prints consistency counts comparing the generated Scholix with the input relations. */
  private def checkRelations(spark: SparkSession, workingPath: String, rel_path: String): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val rels = spark.read.load(rel_path)
    val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
    println(scholix.count())
    scholix
      .map(s => s.getRelationship.getName)
      .groupBy("value")
      .agg(count("value").alias("Total"))
      .show()
    println(rels.count())
    println(rels.select("json").map(r => DHPUtils.extractIdRel(r.getString(0))).distinct().count())
  }

  /** Joins the resolved relations with the summaries to build the complete Scholix records
    * (one-verse plus inverse), deduplicated by Scholix identifier.
    */
  private def createScholix(spark: SparkSession, workingPath: String, summaryPath: String): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val summaryDS = spark.read
      .load(summaryPath)
      .as[ScholixSummary]
      .map(s => (s.getId, s))(Encoders.tuple(Encoders.STRING, summaryEncoder))

    val relationDS = spark.read
      .load(s"$workingPath/relResolved")
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoders))

    // Scholix from the source side: join each relation with the summary of its source.
    relationDS
      .joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
      .map { input: ((String, Relation), (String, ScholixSummary)) =>
        if (input._1 != null && input._2 != null) {
          val rel: Relation = input._1._2
          val source: ScholixSummary = input._2._2
          (rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
        } else null
      }(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .filter(r => r != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_from_source")

    val scholixSource: Dataset[(String, Scholix)] = spark.read
      .load(s"$workingPath/scholix_from_source")
      .as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))

    // Complete each Scholix with the summary of its target (one-verse records).
    scholixSource
      .joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
      .map { input: ((String, Scholix), (String, ScholixSummary)) =>
        if (input._2 == null) null
        else {
          val s: Scholix = input._1._2
          val target: ScholixSummary = input._2._2
          ScholixUtils.generateCompleteScholix(s, target)
        }
      }
      .filter(s => s != null)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_one_verse")

    // Add the inverse of each Scholix and aggregate by identifier.
    val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$workingPath/scholix_one_verse").as[Scholix]

    scholix_o_v
      .flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
      .as[Scholix]
      .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
      .groupByKey(_._1)
      .agg(ScholixUtils.scholixAggregator.toColumn)
      .map(s => s._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix")

    val scholix_final: Dataset[Scholix] = spark.read.load(s"$workingPath/scholix").as[Scholix]
    println(scholix_final.count())
  }

  /** Checks whether a given DOI survives in the final (blacklisted) graph dump. */
  private def checkCrossrefDOI(spark: SparkSession): Unit = {
    implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
    import spark.implicits._
    //    val df = spark.read.load("/data/doiboost/input/crossref/crossref_ds").as[CrossrefDT]
    //    val tot = df.filter(d => d.doi.equalsIgnoreCase("10.1107/s2052252521010563/yc50352sup3.hkl")).count()
    //    println(s"Found $tot")
    val tot2 = spark.read
      .text("/tmp/beta_provision/graph/19_graph_blacklisted/*")
      .as[String]
      .filter(s => s.contains("10.1107/s2052252521010563/yc50352sup3.hkl"))
      .count()
    println(s"Found in the final graph $tot2")
  }

  /** Every Spark application runs this method: the whole logic of the Spark node is defined here. */
  override def run(): Unit = {
    val source_path: String = argumentMap.get("source_path").orNull
    val working_path: String = argumentMap.get("working_path").orNull

    if (source_path == null)
      throw new MissingArgumentException("Missing argument source_path")
    if (working_path == null)
      throw new MissingArgumentException("Missing argument working_path")

    //    convertEventDataToRelations(spark, source_path, s"$working_path/relations")
    //    resolveRelations(spark, working_path)
    //    checkRelations(spark, working_path, source_path)
    //    createScholix(spark, working_path, "/tmp/beta_provision/scholix/provision/summaries")
    //    serializeScholix(spark, working_path)
    checkCrossrefDOI(spark)
    spark.close()
  }
}

object GenerateEventDataRelations {

  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  def main(args: Array[String]): Unit = {
    new GenerateEventDataRelations(args, log).initialize().run()
  }
}