Hosted By Map - refactoring and application of the new aggregator

2021-08-04 10:14:20 +02:00 · 2021-08-04 10:14:20 +02:00 · 8f7623e77a
parent a7bf314fd2
commit 8f7623e77a
1 changed file with 5 additions and 7 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala
@@ -45,7 +45,7 @@ object SparkApplyHostedByMapToDatasource {
    val graphPath = parser.get("graphPath")

    val outputPath = parser.get("outputPath")
-    val workingPath = parser.get("workingPath")
+    val preparedInfoPath = parser.get("preparedInfoPath")


    implicit val formats = DefaultFormats
@@ -55,17 +55,15 @@ object SparkApplyHostedByMapToDatasource {
    implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
    val mapper = new ObjectMapper()

-    val dats : Dataset[Datasource] = spark.read.textFile("$graphPath/datasource")
+    val dats : Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
      .map(r => mapper.readValue(r, classOf[Datasource]))

-    val pinfo : Dataset[EntityInfo] = spark.read.textFile("$workingPath/preparedInfo")
-      .map(ei => mapper.readValue(ei, classOf[EntityInfo]))
-
-
+    val pinfo : Dataset[EntityInfo] = Aggregators.datasourceToSingleId( spark.read.textFile(preparedInfoPath)
+      .map(ei => mapper.readValue(ei, classOf[EntityInfo])))

    //c. dataset join risultato del passo prima di a per datasource id, gruppo per ds id e cambio compatibilita' se necessario

-    applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(s"$graphPath/datasource")
+    applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
  }