From 7423577a082318d6d7648945243e237c9f847029 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni" <miriam.baglioni@isti.cnr.it>
Date: Tue, 21 Jun 2022 14:51:38 +0200
Subject: [PATCH] [Graph DUMP] add code to produce the delta of new projects
 with respect to the previous delta/dump

---
 .../ProjectsSubsetSparkJob.java               |  80 ++++++++
 .../graph/dump/project_subset_parameters.json |  27 +++
 .../oozie_app/config-default.xml              |  30 +++
 .../dump/projectsubset/oozie_app/workflow.xml | 171 ++++++++++++++++++
 4 files changed, 308 insertions(+)
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/projectssubset/ProjectsSubsetSparkJob.java
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/workflow.xml

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/projectssubset/ProjectsSubsetSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/projectssubset/ProjectsSubsetSparkJob.java
new file mode 100644
index 0000000000..67da24185b
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/projectssubset/ProjectsSubsetSparkJob.java
@@ -0,0 +1,80 @@
+package eu.dnetlib.dhp.oa.graph.dump.projectssubset;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import java.io.Serializable;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.jetbrains.annotations.NotNull;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.dump.oaf.community.Funder;
+import eu.dnetlib.dhp.schema.dump.oaf.graph.Project;
+import scala.Tuple2;
+/**
+ * Spark job that selects the projects whose id is not listed under projectListPath, writes them to
+ * outputPath as gzip-compressed JSON and appends their ids to the list under projectListPath.
+ */
+public class ProjectsSubsetSparkJob implements Serializable {
+    private static final Logger log = LoggerFactory.getLogger(ProjectsSubsetSparkJob.class);
+    private static final String PARAMETERS = "/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json";
+    /**
+     * Reads the job arguments declared in the parameters resource and runs the job in a Spark session.
+     * @param args sourcePath, outputPath, projectListPath and, optionally, isSparkSessionManaged
+     * @throws Exception if the arguments cannot be parsed or the job fails
+     */
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(ProjectsSubsetSparkJob.class.getResourceAsStream(PARAMETERS)));
+        parser.parseArgument(args);
+        Boolean isSparkSessionManaged = Optional.ofNullable(parser.get("isSparkSessionManaged"))
+                .map(Boolean::valueOf).orElse(Boolean.TRUE);
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+        final String inputPath = parser.get("sourcePath");
+        log.info("inputPath: {}", inputPath);
+        final String outputPath = parser.get("outputPath");
+        log.info("outputPath: {}", outputPath);
+        final String projectListPath = parser.get("projectListPath");
+        log.info("projectListPath: {}", projectListPath);
+        runWithSparkSession(new SparkConf(), isSparkSessionManaged, spark -> {
+            Utils.removeOutputDir(spark, outputPath);
+            getNewProjectList(spark, inputPath, outputPath, projectListPath);
+        });
+    }
+    /**
+     * Writes to outputPath the projects whose id is not in the list, then appends their ids to the list.
+     * @param spark the Spark session
+     * @param inputPath path of the projects to select from
+     * @param outputPath path where the selected projects are written as gzip-compressed JSON
+     * @param projectListPath path of the text files holding one project id per line
+     */
+    private static void getNewProjectList(SparkSession spark, String inputPath, String outputPath,
+                                          String projectListPath) {
+        Dataset<String> projectList = spark.read().textFile(projectListPath);
+        Dataset<Project> projects = Utils.readPath(spark, inputPath, Project.class);
+        projects
+                // anti join: keeps only the projects whose id has no match in the list, without emitting null beans
+                .join(projectList, projects.col("id").equalTo(projectList.col("value")), "left_anti")
+                .as(Encoders.bean(Project.class))
+                .write()
+                .mode(SaveMode.Overwrite)
+                .option("compression", "gzip")
+                .json(outputPath);
+        Utils.readPath(spark, outputPath, Project.class)
+                .map((MapFunction<Project, String>) Project::getId, Encoders.STRING())
+                .write()
+                .mode(SaveMode.Append)
+                .option("compression", "gzip")
+                .text(projectListPath);
+    }
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json
new file mode 100644
index 0000000000..ed23136806
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/project_subset_parameters.json
@@ -0,0 +1,27 @@
+[
+  {
+    "paramName":"s",
+    "paramLongName":"sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "out",
+    "paramLongName": "outputPath",
+    "paramDescription": "the path used to store temporary output files",
+    "paramRequired": true
+  },
+  {
+    "paramName": "ssm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "true if the spark session is managed, false otherwise",
+    "paramRequired": false
+  },
+  {
+    "paramName": "pl",
+    "paramLongName": "projectListPath",
+    "paramDescription": "the path of the text file listing the ids of the projects already dumped",
+    "paramRequired": true
+  }
+]
+
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/config-default.xml
new file mode 100644
index 0000000000..d262cb6e05
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/config-default.xml
@@ -0,0 +1,30 @@
+<configuration> <!-- default properties for the projectsubset Oozie workflow. NOTE(review): hiveMetastoreUris, hiveJdbcUrl and hiveDbName are not referenced by the workflow.xml added in this patch; confirm they are needed -->
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>hiveMetastoreUris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>hiveJdbcUrl</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
+    </property>
+    <property>
+        <name>hiveDbName</name>
+        <value>openaire</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/workflow.xml
new file mode 100644
index 0000000000..619e3aa207
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/projectsubset/oozie_app/workflow.xml
@@ -0,0 +1,171 @@
+<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>projectListPath</name>
+            <description>the path to the project list</description>
+        </property>
+        <property>
+            <name>outputPath</name>
+            <description>the output path</description>
+        </property>
+        <property>
+            <name>accessToken</name>
+            <description>the access token used for the deposition in Zenodo</description>
+        </property>
+        <property>
+            <name>connectionUrl</name>
+            <description>the connection url for Zenodo</description>
+        </property>
+        <property>
+            <name>metadata</name>
+            <description>the metadata associated with the deposition</description>
+        </property>
+        <property>
+            <name>depositionType</name>
+            <description>the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided)</description>
+        </property>
+        <property>
+            <name>conceptRecordId</name>
+            <description>for new version, the id of the record for the old deposition</description>
+        </property>
+        <property>
+            <name>depositionId</name>
+            <description>the id of an open deposition to which content has to be added</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+    <global> <!-- settings applied to every action. NOTE(review): queueName and oozieLauncherQueueName are not declared in the parameters section above; confirm they are supplied by the submitting job properties -->
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+    <start to="dump_project"/> <!-- chain: dump_project, get_new_projects, make_archive, send_zenodo; any error goes to Kill -->
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <action name="dump_project"> <!-- runs SparkDumpEntitiesJob on ${sourcePath}/project and writes to ${workingDir}/project, the input of get_new_projects -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Dump table project </name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
+            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/project</arg>
+            <arg>--communityMapPath</arg><arg>noneed</arg> <!-- presumably a placeholder for an argument the job requires but does not use for projects; confirm -->
+        </spark>
+        <ok to="get_new_projects"/>
+        <error to="Kill"/>
+    </action>
+    <action name="get_new_projects"> <!-- keeps the projects of ${workingDir}/project whose id is not listed in ${projectListPath} and appends the new ids to that list -->
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Select new projects</name>
+            <class>eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectsSubsetSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${workingDir}/project</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/tar/project</arg>
+            <arg>--projectListPath</arg><arg>${projectListPath}</arg>
+        </spark>
+        <ok to="make_archive"/>
+        <error to="Kill"/>
+    </action>
+    <action name="make_archive"> <!-- runs MakeTar with ${workingDir}/tar as source and ${outputPath} as hdfsPath -->
+        <java>
+            <main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
+            <arg>--hdfsPath</arg><arg>${outputPath}</arg>
+            <arg>--nameNode</arg><arg>${nameNode}</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/tar</arg>
+        </java>
+        <ok to="send_zenodo"/>
+        <error to="Kill"/>
+    </action>
+    <action name="send_zenodo"> <!-- runs SendToZenodoHDFS on ${outputPath}, the path make_archive receives as hdfsPath, with the deposition parameters declared above -->
+        <java>
+            <main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
+            <arg>--hdfsPath</arg><arg>${outputPath}</arg>
+            <arg>--nameNode</arg><arg>${nameNode}</arg>
+            <arg>--accessToken</arg><arg>${accessToken}</arg>
+            <arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
+            <arg>--metadata</arg><arg>${metadata}</arg>
+            <arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
+            <arg>--depositionType</arg><arg>${depositionType}</arg>
+            <arg>--depositionId</arg><arg>${depositionId}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file