code formatted

2024-12-18 11:17:42 +01:00 · 2024-12-18 11:17:42 +01:00 · d8124b947e
parent b7357e18b2
commit d8124b947e
2 changed files with 48 additions and 35 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/minidump/SparkCreateMiniDumpGraph.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/minidump/SparkCreateMiniDumpGraph.scala
@@ -2,8 +2,7 @@ package eu.dnetlib.dhp.oa.graph.minidump

 import eu.dnetlib.dhp.application.AbstractScalaApplication
 import eu.dnetlib.dhp.schema.oaf.{Relation, StructuredProperty}
-import org.apache.spark.sql.expressions.Window
-import org.apache.spark.sql.functions.{col, from_json, monotonically_increasing_id, row_number, size}
+import org.apache.spark.sql.functions.{col, from_json}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
@@ -22,31 +21,35 @@ class SparkCreateMiniDumpGraph(propertyPath: String, args: Array[String], log: L
 //    generateMiniDump(spark, sourcePath, targetPath)
  }

-  def generateMiniDump(spark: SparkSession, sourcePath: String, pidListPath: String, targetPath:String): Unit = {
+  def generateMiniDump(spark: SparkSession, sourcePath: String, pidListPath: String, targetPath: String): Unit = {
    import spark.implicits._
    val pidSchema = new StructType().add("pid", StringType).add("pidType", StringType).add("type", StringType)
    val idSchema = new StructType().add("id", StringType)
    val idWithPidSchema = new StructType()
-      .add("id", StringType).
-      add("pid", ArrayType(Encoders.bean(classOf[StructuredProperty]).schema))
+      .add("id", StringType)
+      .add("pid", ArrayType(Encoders.bean(classOf[StructuredProperty]).schema))
    val typologies = List("publication", "dataset", "software", "otherresearchproduct", "project", "organization")

-
    val pidList = spark.read.schema(pidSchema).json(pidListPath)

-
    typologies.foreach(t => {
      println(s"filtering $t")
-        val entity = spark.read.schema(idWithPidSchema).json(s"$sourcePath/$t")
+      val entity = spark.read
+        .schema(idWithPidSchema)
+        .json(s"$sourcePath/$t")
        .selectExpr("explode(pid) as pids", "id")
        .selectExpr("id", "pids.value as pid", "pids.qualifier.classid as pidType")
        .distinct()
-        val filerId = entity.join(pidList, pidList("pid") === entity("pid"))
-          .select("id").distinct()
+      val filerId = entity
+        .join(pidList, pidList("pid") === entity("pid"))
+        .select("id")
+        .distinct()

      val currentEntity = spark.read.text(s"$sourcePath/$t")
-      val resultWithId=currentEntity.withColumn("jsonData",from_json(col("value"),idSchema)).selectExpr("jsonData.id as id", "value")
-      resultWithId.join(filerId, resultWithId("id") === filerId("id"))
+      val resultWithId =
+        currentEntity.withColumn("jsonData", from_json(col("value"), idSchema)).selectExpr("jsonData.id as id", "value")
+      resultWithId
+        .join(filerId, resultWithId("id") === filerId("id"))
        .select("value")
        .repartition(10)
        .write
@@ -55,10 +58,10 @@ class SparkCreateMiniDumpGraph(propertyPath: String, args: Array[String], log: L
        .text(s"$targetPath/$t")
    })

-
    val emptyDataset = spark.createDataset(Seq.empty[String]).toDF("id")

-    typologies.foldLeft(emptyDataset)((res, item) => {
+    typologies
+      .foldLeft(emptyDataset)((res, item) => {
        println(s"adding $item")
        res.union(
          spark.read
@ -67,16 +70,20 @@ class SparkCreateMiniDumpGraph(propertyPath: String, args: Array[String], log: L
            .selectExpr("id")
            .distinct()
        )
-    }).distinct()
-      .write.mode(SaveMode.Overwrite).save(s"$targetPath/usedIds")
-
+      })
+      .distinct()
+      .write
+      .mode(SaveMode.Overwrite)
+      .save(s"$targetPath/usedIds")

    val filteredIds = spark.read.load(s"$targetPath/usedIds")
    val relations = spark.read.schema(Encoders.bean(classOf[Relation]).schema).json(s"$sourcePath/relation")
-    val filteredRelations = relations.join(filteredIds, relations("source") === filteredIds("id") || relations("target") === filteredIds("id"))
+    val filteredRelations = relations.join(
+      filteredIds,
+      relations("source") === filteredIds("id") || relations("target") === filteredIds("id")
+    )

-    filteredRelations
-      .write
+    filteredRelations.write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .json(s"$targetPath/relation")
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/minidump/MiniDumpTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/minidump/MiniDumpTest.scala
@@ -9,9 +9,15 @@ class MiniDumpTest {
  def testMiniDumpGeneration(): Unit = {
    val spark = SparkSession.builder().appName("MiniDumpTest").master("local[*]").getOrCreate()

-    val sparkCreateMiniDumpGraph = new SparkCreateMiniDumpGraph("src/test/resources/application.properties", Array(), null)
+    val sparkCreateMiniDumpGraph =
+      new SparkCreateMiniDumpGraph("src/test/resources/application.properties", Array(), null)

-    sparkCreateMiniDumpGraph.generateMiniDump(spark, "/home/sandro/OGraph/05_graph_inferred", "/home/sandro/OGraph/pid_json", "/home/sandro/OGraph/minidump")
+    sparkCreateMiniDumpGraph.generateMiniDump(
+      spark,
+      "/home/sandro/OGraph/05_graph_inferred",
+      "/home/sandro/OGraph/pid_json",
+      "/home/sandro/OGraph/minidump"
+    )

  }