dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala

package eu.dnetlib.dhp.sx.graph

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, count}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}

import scala.util.Try

object SparkCreateScholix {

  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()

    val parser = new ArgumentApplicationParser(
      IOUtils.toString(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/create_scholix_params.json")
      )
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val relationPath = parser.get("relationPath")
    log.info(s"relationPath -> $relationPath")
    val summaryPath = parser.get("summaryPath")
    log.info(s"summaryPath -> $summaryPath")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")
    val dumpCitations = Try(parser.get("dumpCitations").toBoolean).getOrElse(false)
    log.info(s"dumpCitations -> $dumpCitations")
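    // Bean encoders for the model classes exchanged in the joins and maps below.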
    implicit val relEncoder: Encoder[Relation] = Encoders.bean(classOf[Relation])
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.bean(classOf[ScholixSummary])
    implicit val resourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.bean(classOf[Scholix])

    import spark.implicits._

    spark.sparkContext.setLogLevel("WARN")
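    // Load the graph relations, dropping those deleted by inference and any relClass containing "merge".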
    val relationDS: Dataset[Relation] = spark.read
      .schema(relEncoder.schema)
      .json(relationPath)
      .as[Relation]
      .filter(r =>
        (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase
          .contains("merge")
      )
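    // Load the entity summaries and convert each one into a ScholixResource.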
    val summaryDS: Dataset[ScholixResource] = spark.read
      .schema(summaryEncoder.schema)
      .json(summaryPath)
      .as[ScholixSummary]
      .map(s => ScholixUtils.generateScholixResourceFromSummary(s))
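    // Join every relation with its source resource to build a partial Scholix record;
    // the record identifier temporarily holds the target id for the follow-up join.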
    val scholixSource: Dataset[Scholix] = relationDS
      .joinWith(summaryDS, relationDS("source").equalTo(summaryDS("dnetIdentifier")), "left")
      .map { input: (Relation, ScholixResource) =>
        if (input._1 != null && input._2 != null) {
          val rel: Relation = input._1
          val source: ScholixResource = input._2
          val s = ScholixUtils.scholixFromSource(rel, source)
          s.setIdentifier(rel.getTarget)
          s
        } else null
      }(scholixEncoder)
      .filter(r => r != null)
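    // Attach the target resource to every partial record, drop records whose target
    // cannot be resolved, and persist the single-direction ("one verse") Scholix links.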
    scholixSource
      .joinWith(summaryDS, scholixSource("identifier").equalTo(summaryDS("dnetIdentifier")), "left")
      .map { input: (Scholix, ScholixResource) =>
        if (input._2 == null) {
          null
        } else {
          val s: Scholix = input._1
          val target: ScholixResource = input._2
          ScholixUtils.generateCompleteScholix(s, target)
        }
      }
      .filter(s => s != null)
      .write
      .option("compression", "lz4")
      .mode(SaveMode.Overwrite)
      .save(s"$targetPath/scholix_one_verse")

    val scholix_o_v: Dataset[Scholix] =
      spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
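    // A record is complete only when it has an identifier, both endpoints and at least one link provider.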
    def scholix_complete(s: Scholix): Boolean = {
      if (s == null || s.getIdentifier == null) {
        false
      } else if (s.getSource == null || s.getTarget == null) {
        false
      } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
        false
      else
        true
    }
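    // Group by the unordered (source, target) pair plus the relation name: a group of size one
    // means only one direction exists, so the inverse relation is generated on the fly.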
    val scholix_final: Dataset[Scholix] = scholix_o_v
      .filter(s => scholix_complete(s))
      .groupByKey(s =>
        scala.Ordering.String
          .min(s.getSource.getDnetIdentifier, s.getTarget.getDnetIdentifier)
          .concat(s.getRelationship.getName)
          .concat(scala.Ordering.String.max(s.getSource.getDnetIdentifier, s.getTarget.getDnetIdentifier))
      )
      .flatMapGroups((id, scholixes) => {
        val s = scholixes.toList
        if (s.size == 1) Seq(s(0), ScholixUtils.createInverseScholixRelation(s(0)))
        else s
      })

    scholix_final.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
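    // Count, per source identifier and related object type, how many records it links to,
    // then fold the counts into one RelatedEntities row per identifier.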
    val stats: Dataset[(String, String, Long)] = scholix_final
      .map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType))
      .groupBy("_1", "_2")
      .agg(count("_1"))
      .as[(String, String, Long)]

    stats
      .map(s =>
        RelatedEntities(
          s._1,
          if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0,
          if ("publication".equalsIgnoreCase(s._2)) s._3 else 0
        )
      )
      .groupByKey(_.id)
      .reduceGroups((a, b) =>
        RelatedEntities(
          a.id,
          a.relatedDataset + b.relatedDataset,
          a.relatedPublication + b.relatedPublication
        )
      )
      .map(_._2)
      .write
      .option("compression", "lz4")
      .mode(SaveMode.Overwrite)
      .save(s"$targetPath/related_entities")
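    // Unless dumpCitations is set, keep only identifiers with at least one related dataset or publication.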
    val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
      .load(s"$targetPath/related_entities")
      .as[RelatedEntities]
      .filter(r => dumpCitations || r.relatedPublication > 0 || r.relatedDataset > 0)

    val summaryDS2: Dataset[ScholixSummary] = spark.read
      .schema(summaryEncoder.schema)
      .json(summaryPath)
      .as[ScholixSummary]
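    // Enrich each retained summary with its related dataset/publication counts and write
    // the filtered summaries alongside the input path (suffix "_filtered").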
    relatedEntitiesDS
      .joinWith(summaryDS2, relatedEntitiesDS("id").equalTo(summaryDS2("id")), "inner")
      .map { i =>
        val re = i._1
        val sum = i._2
        sum.setRelatedDatasets(re.relatedDataset)
        sum.setRelatedPublications(re.relatedPublication)
        sum
      }
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "lz4")
      .save(s"${summaryPath}_filtered")
  }
}