Job to include hive graph in the current generation phase, after cleaning context

2024-11-27 18:15:23 +01:00 · 2024-11-27 18:15:23 +01:00 · 2b666c8aa6
parent 1d80c1da57
commit 2b666c8aa6
4 changed files with 104 additions and 3 deletions
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java
@ -130,7 +130,7 @@ public class ResultTagger implements Serializable {
 					// log.info("Remove constraints for " + communityId);
 					if (conf.getRemoveConstraintsMap().keySet().contains(communityId) &&
 						conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
-							!conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() &&
+						!conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() &&
 						conf
 							.getRemoveConstraintsMap()
 							.get(communityId)
@ -228,7 +228,7 @@ public class ResultTagger implements Serializable {
 			.forEach(communityId -> {
 				if (!removeCommunities.contains(communityId) &&
 					conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
-						!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() &&
+					!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() &&
 					conf
 						.getSelectionConstraintsMap()
 						.get(communityId)
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
@ -90,7 +90,7 @@ public class SparkCountryPropagationJob {

 		if (!preparedInfoRaw.isEmpty()) {
 			res
-				.joinWith(preparedInfoRaw, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
+				.joinWith(preparedInfoRaw, res.col("id").equalTo(preparedInfoRaw.col("resultId")), "left_outer")
 				.map(getCountryMergeFn(), Encoders.bean(resultClazz))
 				.write()
 				.option("compression", "gzip")
--- a/dhp-workflows/dhp-incremental-graph/pom.xml
+++ b/dhp-workflows/dhp-incremental-graph/pom.xml
@ -53,6 +53,11 @@
            <artifactId>dhp-aggregation</artifactId>
            <version>${project.version}</version>
        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-enrichment</artifactId>
+            <version>${project.version}</version>
+        </dependency>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-graph-mapper</artifactId>
--- a/dhp-workflows/dhp-incremental-graph/src/main/java/eu/dnetlib/dhp/incremental/SparkAppendContextCleanedGraph.scala
+++ b/dhp-workflows/dhp-incremental-graph/src/main/java/eu/dnetlib/dhp/incremental/SparkAppendContextCleanedGraph.scala
@ -0,0 +1,96 @@
+package eu.dnetlib.dhp.incremental
+
+import eu.dnetlib.dhp.PropagationConstant
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.bulktag.community.TaggingConstants
+import eu.dnetlib.dhp.schema.common.ModelSupport
+import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity}
+import org.apache.commons.io.IOUtils
+import org.apache.spark.SparkConf
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsScalaMapConverter, seqAsJavaListConverter}
+
+/** Spark job that exports every table of a Hive graph database as gzip-compressed JSON,
+  * appending to `targetGraph/<table>`.
+  *
+  * Tables whose model class is an [[OafEntity]] have their contexts cleaned first (see
+  * `dropInferredContexts`); every other table listed in `ModelSupport.oafTypes` is copied as-is.
+  */
+object SparkAppendContextCleanedGraph {
+
+  /** Removes, in place, the context provenance produced by propagation and bulk tagging.
+    *
+    * For each context, the `dataInfo` entries whose `inferenceprovenance` equals
+    * `PropagationConstant.PROPAGATION_DATA_INFO_TYPE` or `TaggingConstants.BULKTAG_DATA_INFO_TYPE`
+    * are discarded; a context left with an empty `dataInfo` list is dropped from the entity.
+    * Contexts whose `dataInfo` is null are kept untouched, and entities with a null context
+    * list are left unchanged.
+    *
+    * @param oafEntity the entity to clean; it is mutated
+    */
+  private def dropInferredContexts(oafEntity: OafEntity): Unit = {
+    if (oafEntity.getContext != null) {
+      val cleanedContexts = oafEntity.getContext.asScala
+        .map { c =>
+          if (c.getDataInfo != null) {
+            c.setDataInfo(
+              c.getDataInfo.asScala
+                .filter { di =>
+                  // Constant-first equals: a null inferenceprovenance is simply "not inferred"
+                  // instead of a NullPointerException.
+                  !PropagationConstant.PROPAGATION_DATA_INFO_TYPE.equals(di.getInferenceprovenance) &&
+                  !TaggingConstants.BULKTAG_DATA_INFO_TYPE.equals(di.getInferenceprovenance)
+                }
+                .toList
+                .asJava
+            )
+          }
+          c
+        }
+        // Null-safe: a context without any dataInfo was not emptied by the cleaning above,
+        // so it is retained rather than dereferenced.
+        .filter(c => c.getDataInfo == null || !c.getDataInfo.isEmpty)
+        .toList
+        .asJava
+      oafEntity.setContext(cleanedContexts)
+    }
+  }
+
+  /** Job entry point.
+    *
+    * Reads `hiveMetastoreUris`, `graphBasePath`, `relationPath`, `targetGraph` and `hiveDbName`
+    * from the parsed arguments. `graphBasePath` and `relationPath` are only logged here.
+    *
+    * @param args command-line arguments handed to the `ArgumentApplicationParser`
+    */
+  def main(args: Array[String]): Unit = {
+    val log: Logger = LoggerFactory.getLogger(getClass)
+    val conf: SparkConf = new SparkConf()
+
+    val paramsStream = getClass.getResourceAsStream(
+      "/eu/dnetlib/dhp/oa/graph/incremental/resolution/resolve_relationsbyid_params.json"
+    )
+    // Read with an explicit charset and always release the resource stream.
+    val paramsJson =
+      try IOUtils.toString(paramsStream, java.nio.charset.StandardCharsets.UTF_8)
+      finally if (paramsStream != null) paramsStream.close()
+
+    val parser = new ArgumentApplicationParser(paramsJson)
+    parser.parseArgument(args)
+    conf.set("hive.metastore.uris", parser.get("hiveMetastoreUris"))
+
+    val graphBasePath = parser.get("graphBasePath")
+    log.info(s"graphBasePath  -> $graphBasePath")
+    val relationPath = parser.get("relationPath")
+    log.info(s"relationPath  -> $relationPath")
+    val targetPath = parser.get("targetGraph")
+    log.info(s"targetGraph  -> $targetPath")
+
+    val hiveDbName = parser.get("hiveDbName")
+    log.info(s"hiveDbName  -> $hiveDbName")
+
+    val spark: SparkSession =
+      SparkSession
+        .builder()
+        .config(conf)
+        .enableHiveSupport()
+        .appName(getClass.getSimpleName)
+        .getOrCreate()
+
+    for ((entity, clazz) <- ModelSupport.oafTypes.asScala) {
+      val table = spark.table(s"${hiveDbName}.${entity}")
+
+      // Entity tables are decoded to beans so their contexts can be cleaned;
+      // the remaining tables are written out exactly as read.
+      val output: Dataset[_] =
+        if (classOf[OafEntity].isAssignableFrom(clazz)) {
+          val classEnc: Encoder[Oaf] = Encoders.bean(clazz).asInstanceOf[Encoder[Oaf]]
+          table
+            .as(classEnc)
+            .map { e =>
+              e match {
+                case oafEntity: OafEntity => dropInferredContexts(oafEntity)
+                case _                    => // not reachable: clazz is an OafEntity subtype
+              }
+              e
+            }(classEnc)
+        } else {
+          table
+        }
+
+      output.write
+        .option("compression", "gzip")
+        .mode(SaveMode.Append)
+        .json(s"$targetPath/${entity}")
+    }
+  }
+}