Hive to HDFS export job with context cleaning for incremental graph
commit c9ec9ec726
parent 2b666c8aa6
New file: GraphHiveTableExporterJob.java

@@ -0,0 +1,80 @@
package eu.dnetlib.dhp.oa.graph.hive;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;

public class GraphHiveTableExporterJob {

    private static final Logger log = LoggerFactory.getLogger(GraphHiveTableExporterJob.class);

    public static void main(String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    GraphHiveTableExporterJob.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/oa/graph/hive_db_exporter_parameters.json")));
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        int numPartitions = Optional
            .ofNullable(parser.get("numPartitions"))
            .map(Integer::valueOf)
            .orElse(-1);
        log.info("numPartitions: {}", numPartitions);

        String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        String hiveTableName = parser.get("hiveTableName");
        log.info("hiveTableName: {}", hiveTableName);

        String hiveMetastoreUris = parser.get("hiveMetastoreUris");
        log.info("hiveMetastoreUris: {}", hiveMetastoreUris);

        String mode = parser.get("mode");
        log.info("mode: {}", mode);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", hiveMetastoreUris);

        runWithSparkHiveSession(
            conf, isSparkSessionManaged,
            spark -> saveGraphTable(spark, outputPath, hiveTableName, mode, numPartitions));
    }

    // protected for testing
    private static <T extends Oaf> void saveGraphTable(SparkSession spark, String outputPath, String hiveTableName,
        String mode, int numPartitions) {

        Dataset<Row> dataset = spark.table(hiveTableName);

        if (numPartitions > 0) {
            log.info("repartitioning to {} partitions", numPartitions);
            dataset = dataset.repartition(numPartitions);
        }

        dataset
            .write()
            .mode(mode)
            .option("compression", "gzip")
            .json(outputPath);
    }
}
New file: hive_db_exporter_parameters.json

@@ -0,0 +1,32 @@
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path to the graph data dump to read",
    "paramRequired": true
  },
  {
    "paramName": "mode",
    "paramLongName": "mode",
    "paramDescription": "mode (append|overwrite)",
    "paramRequired": true
  },
  {
    "paramName": "hmu",
    "paramLongName": "hiveMetastoreUris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": true
  },
  {
    "paramName": "db",
    "paramLongName": "hiveTableName",
    "paramDescription": "the input hive table identifier",
    "paramRequired": true
  }
]
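The parameters above map one-to-one to the arguments read by GraphHiveTableExporterJob. The sketch below shows a hypothetical programmatic invocation; all paths, URIs and table names are placeholders (not taken from this commit), and the long-option argument syntax expected by ArgumentApplicationParser is assumed rather than verified. In practice the job would be submitted through spark-submit or an Oozie spark action rather than called directly.

import eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableExporterJob

// A minimal sketch, assuming "--paramLongName value" argument syntax and
// placeholder values; not a verified invocation of the real job.
object HiveTableExportSketch {
  def main(args: Array[String]): Unit = {
    GraphHiveTableExporterJob.main(
      Array(
        "--isSparkSessionManaged", "true",                         // per the parameter description, stops the SparkSession after execution
        "--outputPath", "/tmp/graph_export/publication",           // hypothetical HDFS target path
        "--mode", "overwrite",                                     // save mode passed to dataset.write().mode(...)
        "--hiveMetastoreUris", "thrift://example-metastore:9083",  // placeholder metastore URI
        "--hiveTableName", "graph_db.publication"                  // hypothetical <db>.<table> identifier
      ))
  }
}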
Modified file: SparkAppendContextCleanedGraph.scala

@@ -21,19 +21,15 @@ object SparkAppendContextCleanedGraph
     val parser = new ArgumentApplicationParser(
       IOUtils.toString(
         getClass.getResourceAsStream(
-          "/eu/dnetlib/dhp/oa/graph/incremental/resolution/resolve_relationsbyid_params.json"
+          "/eu/dnetlib/dhp/oa/graph/incremental/export_hive/append_context_cleaned_graph.json"
         )
       )
     )
     parser.parseArgument(args)
     conf.set("hive.metastore.uris", parser.get("hiveMetastoreUris"))

-    val graphBasePath = parser.get("graphBasePath")
-    log.info(s"graphBasePath -> $graphBasePath")
-    val relationPath = parser.get("relationPath")
-    log.info(s"relationPath -> $relationPath")
-    val targetPath = parser.get("targetGraph")
-    log.info(s"targetGraph -> $targetPath")
+    val outputPath = parser.get("outputPath")
+    log.info(s"outputPath -> $outputPath")

     val hiveDbName = parser.get("hiveDbName")
     log.info(s"hiveDbName -> $hiveDbName")

@@ -46,7 +42,7 @@ object SparkAppendContextCleanedGraph
       .appName(getClass.getSimpleName)
       .getOrCreate()

-    for ((entity, clazz) <- ModelSupport.oafTypes.asScala) {
+    for ((entity, clazz) <- ModelSupport.oafTypes.asScala.filter(t => !Seq("datasource", "organization", "person", "project").contains(t._1))) {
       if (classOf[OafEntity].isAssignableFrom(clazz)) {
         val classEnc: Encoder[Oaf] = Encoders.bean(clazz).asInstanceOf[Encoder[Oaf]]

@@ -63,8 +59,9 @@ object SparkAppendContextCleanedGraph
               c.getDataInfo.asScala
                 .filter(
                   di =>
-                    !di.getInferenceprovenance.equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)
-                      && !di.getInferenceprovenance.equals(TaggingConstants.BULKTAG_DATA_INFO_TYPE)
+                    di == null || di.getInferenceprovenance == null ||
+                      (!di.getInferenceprovenance.equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)
+                        && !di.getInferenceprovenance.equals(TaggingConstants.BULKTAG_DATA_INFO_TYPE))
                 )
                 .toList
                 .asJava

@@ -82,14 +79,14 @@ object SparkAppendContextCleanedGraph
           .write
           .option("compression", "gzip")
           .mode(SaveMode.Append)
-          .json(s"$targetPath/${entity}")
+          .json(s"$outputPath/${entity}")
       } else {
         spark
           .table(s"${hiveDbName}.${entity}")
           .write
           .option("compression", "gzip")
           .mode(SaveMode.Append)
-          .json(s"$targetPath/${entity}")
+          .json(s"$outputPath/${entity}")
       }
     }
   }
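The filter change above is the heart of the context cleaning: a dataInfo entry is kept unless its inference provenance marks it as produced by propagation or bulk tagging, and null entries or null provenances are now tolerated instead of raising NullPointerExceptions. Below is a minimal standalone sketch of that predicate; the provenance strings are placeholders, not the actual values of PropagationConstant.PROPAGATION_DATA_INFO_TYPE and TaggingConstants.BULKTAG_DATA_INFO_TYPE.

// A minimal sketch of the provenance-based dataInfo filter; the constant values
// below are placeholders for PROPAGATION_DATA_INFO_TYPE / BULKTAG_DATA_INFO_TYPE.
object ContextCleaningSketch {

  val PropagationProvenance = "propagation" // placeholder value
  val BulkTagProvenance = "bulktagging"     // placeholder value

  // Keep an entry when it has no inference provenance, or when it was not
  // produced by propagation or bulk tagging (mirrors the null-safe check above).
  def keep(inferenceProvenance: String): Boolean =
    inferenceProvenance == null ||
      (inferenceProvenance != PropagationProvenance && inferenceProvenance != BulkTagProvenance)

  def main(args: Array[String]): Unit = {
    val provenances = Seq("propagation", "bulktagging", "user:claim", null)
    println(provenances.filter(keep)) // List(user:claim, null)
  }
}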
New file: append_context_cleaned_graph.json

@@ -0,0 +1,20 @@
[
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path to the graph data dump to read",
    "paramRequired": true
  },
  {
    "paramName": "hmu",
    "paramLongName": "hiveMetastoreUris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": true
  },
  {
    "paramName": "db",
    "paramLongName": "hiveDbName",
    "paramDescription": "the input hive database identifier",
    "paramRequired": true
  }
]
New file: Oozie workflow default configuration

@@ -0,0 +1,26 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
</configuration>
New file: Oozie workflow (import_graph_as_hive_DB)

@@ -0,0 +1,63 @@
<workflow-app name="import_graph_as_hive_DB" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>outputPath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>hiveDbName</name>
            <description>the target hive database name</description>
        </property>
        <property>
            <name>hiveMetastoreUris</name>
            <description>hive server metastore URIs</description>
        </property>
        <!-- General oozie workflow properties -->
        <property>
            <name>sparkClusterOpts</name>
            <value>--conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
            <description>spark cluster-wide options</description>
        </property>
        <property>
            <name>sparkResourceOpts</name>
            <value>--executor-memory=3G --conf spark.executor.memoryOverhead=3G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
            <description>spark resource options</description>
        </property>
        <property>
            <name>sparkApplicationOpts</name>
            <value>--conf spark.sql.shuffle.partitions=1024</value>
            <description>spark application options</description>
        </property>
    </parameters>

    <start to="merge_db_entities"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="merge_db_entities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge Oaf Entities from hive db</name>
            <class>eu.dnetlib.dhp.incremental.SparkAppendContextCleanedGraph</class>
            <jar>dhp-incremental-graph-${projectVersion}.jar</jar>
            <spark-opts>
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                ${sparkClusterOpts}
                ${sparkResourceOpts}
                ${sparkApplicationOpts}
            </spark-opts>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
            <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>

</workflow-app>
Modified file: graph copy workflow (distcp actions)

@@ -98,10 +98,10 @@
             <arg>${nameNode}/${graphBasePath}/otherresearchproduct</arg>
             <arg>${nameNode}/${targetGraph}/otherresearchproduct</arg>
         </distcp>
-        <ok to="copy_person"/>
+        <ok to="copy_project"/>
         <error to="Kill"/>
     </action>
-    <action name="copy_person">
+    <!-- <action name="copy_person">
         <distcp xmlns="uri:oozie:distcp-action:0.2">
             <arg>${nameNode}/${graphBasePath}/person</arg>
             <arg>${nameNode}/${targetGraph}/person</arg>

@@ -109,6 +109,7 @@
         <ok to="copy_project"/>
         <error to="Kill"/>
     </action>
+    -->
     <action name="copy_project">
         <distcp xmlns="uri:oozie:distcp-action:0.2">
             <arg>${nameNode}/${graphBasePath}/project</arg>