New logic and workflow for the dump of results linked to projects. In this implementation the dumped results match the CommunityResult model.

2020-11-19 19:15:39 +01:00 · 2020-11-19 19:15:39 +01:00 · 24c56fa7a3
parent fafb688887
commit 24c56fa7a3
5 changed files with 501 additions and 27 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/FunderResults.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/FunderResults.java
@ -3,9 +3,9 @@ package eu.dnetlib.dhp.oa.graph.dump.funderresults;
 import java.io.Serializable;
-import eu.dnetlib.dhp.schema.dump.oaf.Result;
+import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
-public class FunderResults extends Result implements Serializable {
+public class FunderResults extends CommunityResult implements Serializable {
 	private String funder_id;
 	public String getFunder_id() {
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java
@ -17,9 +17,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.api.zenodo.Community;
 import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
 import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import scala.Tuple2;
@ -53,8 +55,8 @@ public class SparkDumpFunderResults implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
-		final String communityMapPath = parser.get("communityMapPath");
+		final String relationPath = parser.get("relationPath");
-		log.info("communityMapPath: {}", communityMapPath);
+		log.info("relationPath: {}", relationPath);
 		SparkConf conf = new SparkConf();
@ -63,29 +65,26 @@ public class SparkDumpFunderResults implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				Utils.removeOutputDir(spark, outputPath);
-				writeResultProjectList(spark, inputPath, outputPath, communityMapPath);
+				writeResultProjectList(spark, inputPath, outputPath);
 			});
 	}
-	private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath,
+	private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath) {
 											   String communityMapPath) {
 		CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
 		Dataset<Relation> relation = Utils
 			.readPath(spark, inputPath + "/relation", Relation.class)
 			.filter("dataInfo.deletedbyinference = false and relClass = 'produces'");
-		Dataset<eu.dnetlib.dhp.schema.oaf.Result> result = Utils
+		Dataset<CommunityResult> result = Utils
-			.readPath(spark, inputPath + "/publication", eu.dnetlib.dhp.schema.oaf.Result.class)
+			.readPath(spark, inputPath + "/publication", CommunityResult.class)
-			.union(Utils.readPath(spark, inputPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Result.class))
+			.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
-			.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", eu.dnetlib.dhp.schema.oaf.Result.class))
+			.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class))
-			.union(Utils.readPath(spark, inputPath + "/software", eu.dnetlib.dhp.schema.oaf.Result.class));
+			.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
 		result
 			.joinWith(relation, result.col("id").equalTo(relation.col("target")), "inner")
-				.map((MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Result, Relation>, FunderResults>) value ->{
+			.map((MapFunction<Tuple2<CommunityResult, Relation>, FunderResults>) value -> {
-					FunderResults res = (FunderResults) ResultMapper.map(value._1(), communityMap, false);
+				FunderResults res = (FunderResults) value._1();
 				res.setFunder_id(value._2().getSource().substring(3, 15));
 				return res;
 			}, Encoders.bean(FunderResults.class))
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkResultLinkedToProject.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkResultLinkedToProject.java
@ -0,0 +1,82 @@
 package eu.dnetlib.dhp.oa.graph.dump.funderresults;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
 import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Result;
 /**
  * Spark job that selects, from a single result table, the results that are the target of at least one
  * non-deleted {@code produces} relation, and writes them as gzip-compressed JSON.
  *
  * Arguments (declared in {@code input_parameters_link_prj.json}): {@code sourcePath}, {@code outputPath},
  * {@code resultTableName} (fully qualified class name of the result type), {@code relationPath} and the
  * optional {@code isSparkSessionManaged}.
  */
 public class SparkResultLinkedToProject implements Serializable {
 	private static final Logger log = LoggerFactory.getLogger(SparkResultLinkedToProject.class);
 	/**
 	 * Parses the command line, resolves the result class and runs the selection inside a Spark session.
 	 *
 	 * @param args command line arguments, see the class description
 	 * @throws Exception if the parameter file cannot be read, the arguments cannot be parsed, the result class
 	 *                   cannot be loaded or the Spark job fails
 	 */
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				SparkResultLinkedToProject.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/oa/graph/dump/input_parameters_link_prj.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 		Boolean isSparkSessionManaged = Optional
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String inputPath = parser.get("sourcePath");
 		log.info("inputPath: {}", inputPath);
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
 		final String resultClassName = parser.get("resultTableName");
 		log.info("resultTableName: {}", resultClassName);
 		final String relationPath = parser.get("relationPath");
 		log.info("relationPath: {}", relationPath);
 		// The class name comes from the workflow configuration and is expected to name a Result subclass;
 		// the cast cannot be checked at compile time.
 		@SuppressWarnings("unchecked")
 		Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				Utils.removeOutputDir(spark, outputPath);
 				writeResultsLikedToProjects(spark, inputClazz, inputPath, outputPath, relationPath);
 			});
 	}
 	/**
 	 * Writes to {@code outputPath} the results read from {@code inputPath} whose id is the target of at least
 	 * one relation with {@code relClass = 'produces'} that is not deleted by inference.
 	 *
 	 * @param spark        the active Spark session
 	 * @param inputClazz   the bean class used to read the results
 	 * @param inputPath    the path of the result table
 	 * @param outputPath   the path where the selected results are written
 	 * @param relationPath the path of the relation table
 	 */
 	private static <R extends Result> void writeResultsLikedToProjects(SparkSession spark, Class<R> inputClazz,
 		String inputPath, String outputPath, String relationPath) {
 		Dataset<R> results = Utils.readPath(spark, inputPath, inputClazz);
 		Dataset<Relation> relations = Utils
 			.readPath(spark, relationPath, Relation.class)
 			.filter("dataInfo.deletedbyinference = false and relClass = 'produces'");
 		// A left semi join keeps only the columns of the result side and emits each result at most once,
 		// however many relations match. Writing the (relation, result) pairs of an inner joinWith would
 		// instead serialize a {"_1": ..., "_2": ...} wrapper and repeat results linked to several projects.
 		results
 			.join(
 				relations, results.col("id").equalTo(relations.col("target")),
 				"left_semi")
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
 			.json(outputPath);
 	}
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml
@ -127,10 +127,367 @@
            <arg>--nameNode</arg><arg>${nameNode}</arg>
            <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
        </java>
-        <ok to="dump_funder_results"/>
+        <ok to="fork_result_linked_to_projects"/>
        <error to="Kill"/>
    </action>
    <!-- Starts the four per-type actions that select the results linked to projects. -->
    <fork name="fork_result_linked_to_projects">
        <path start="select_publication_linked_to_projects"/>
        <path start="select_dataset_linked_to_projects"/>
        <path start="select_orp_linked_to_project"/>
        <path start="select_software_linked_to_projects"/>
    </fork>
    <!-- Runs SparkResultLinkedToProject on ${sourcePath}/publication against the relations in
         ${sourcePath}/relation; the output goes to ${workingDir}/result/publication. -->
    <action name="select_publication_linked_to_projects">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump funder results </name>
            <class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/result/publication</arg>
            <arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
        </spark>
        <ok to="join_link"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkResultLinkedToProject on ${sourcePath}/dataset against the relations in
         ${sourcePath}/relation; the output goes to ${workingDir}/result/dataset.
         The result class must be the Dataset one, matching the table being read and the class
         later used by dump_dataset on this output. -->
    <action name="select_dataset_linked_to_projects">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump funder results </name>
            <class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/result/dataset</arg>
            <arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
        </spark>
        <ok to="join_link"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkResultLinkedToProject on ${sourcePath}/otherresearchproduct against the relations in
         ${sourcePath}/relation; the output goes to ${workingDir}/result/otherresearchproduct.
         The result class must be the OtherResearchProduct one, matching the table being read and the
         class later used by dump_orp on this output. -->
    <action name="select_orp_linked_to_project">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump funder results </name>
            <class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/result/otherresearchproduct</arg>
            <arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
        </spark>
        <ok to="join_link"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkResultLinkedToProject on ${sourcePath}/software against the relations in
         ${sourcePath}/relation; the output goes to ${workingDir}/result/software.
         The result class must be the Software one, matching the table being read and the class
         later used by dump_software on this output. -->
    <action name="select_software_linked_to_projects">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump funder results </name>
            <class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/result/software</arg>
            <arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
        </spark>
        <ok to="join_link"/>
        <error to="Kill"/>
    </action>
    <!-- Waits for the four selection actions, then starts the four per-type dump actions. -->
    <join name="join_link" to="fork_dump"/>
    <fork name="fork_dump">
        <path start="dump_publication"/>
        <path start="dump_dataset"/>
        <path start="dump_orp"/>
        <path start="dump_software"/>
    </fork>
    <!-- Runs SparkDumpCommunityProducts on the selected publications in ${workingDir}/result/publication,
         with the community map at ${workingDir}/communityMap; the output goes to
         ${workingDir}/dump/publication. -->
    <action name="dump_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump table publication for community related products</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/result/publication</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dump/publication</arg>
            <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
        </spark>
        <ok to="join_dump"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkDumpCommunityProducts on the selected datasets in ${workingDir}/result/dataset,
         with the community map at ${workingDir}/communityMap; the output goes to
         ${workingDir}/dump/dataset. -->
    <action name="dump_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump table dataset for community related products</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/result/dataset</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dump/dataset</arg>
            <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
        </spark>
        <ok to="join_dump"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkDumpCommunityProducts on the selected other research products in
         ${workingDir}/result/otherresearchproduct, with the community map at ${workingDir}/communityMap;
         the output goes to ${workingDir}/dump/otherresearchproduct. -->
    <action name="dump_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump table ORP for community related products</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/result/otherresearchproduct</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
            <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
        </spark>
        <ok to="join_dump"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkDumpCommunityProducts on the selected software in ${workingDir}/result/software,
         with the community map at ${workingDir}/communityMap; the output goes to
         ${workingDir}/dump/software. -->
    <action name="dump_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Dump table software for community related products</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/result/software</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dump/software</arg>
            <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
        </spark>
        <ok to="join_dump"/>
        <error to="Kill"/>
    </action>
    <!-- Waits for the four dump actions, then runs SparkPrepareResultProject on ${sourcePath}.
         Its output, ${workingDir}/preparedInfo, is the preparedInfoPath given to the extend actions. -->
    <join name="join_dump" to="prepareResultProject"/>
    <action name="prepareResultProject">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Prepare association result subset of project info</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="fork_extendWithProject"/>
        <error to="Kill"/>
    </action>
    <!-- Starts the four per-type actions that extend the dumped results with project information. -->
    <fork name="fork_extendWithProject">
        <path start="extend_publication"/>
        <path start="extend_dataset"/>
        <path start="extend_orp"/>
        <path start="extend_software"/>
    </fork>
    <!-- Runs SparkUpdateProjectInfo on ${workingDir}/dump/publication with the prepared info in
         ${workingDir}/preparedInfo; the output goes to ${workingDir}/ext/publication. -->
    <action name="extend_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Extend dumped publications with information about project</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/dump/publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/ext/publication</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="join_extend"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkUpdateProjectInfo on ${workingDir}/dump/dataset with the prepared info in
         ${workingDir}/preparedInfo; the output goes to ${workingDir}/ext/dataset. -->
    <action name="extend_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Extend dumped dataset with information about project</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/dump/dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/ext/dataset</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="join_extend"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkUpdateProjectInfo on ${workingDir}/dump/otherresearchproduct with the prepared info in
         ${workingDir}/preparedInfo. The output directory is named otherresearchproduct, not orp, because
         dump_funder_results is given ${workingDir}/ext as source path and SparkDumpFunderResults reads
         the otherresearchproduct sub-directory of its source path. -->
    <action name="extend_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Extend dumped ORP with information about project</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/ext/otherresearchproduct</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="join_extend"/>
        <error to="Kill"/>
    </action>
    <!-- Runs SparkUpdateProjectInfo on ${workingDir}/dump/software with the prepared info in
         ${workingDir}/preparedInfo; the output goes to ${workingDir}/ext/software.
         The join that follows waits for the four extend actions before dump_funder_results. -->
    <action name="extend_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Extend dumped software with information about project</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${workingDir}/dump/software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/ext/software</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
        </spark>
        <ok to="join_extend"/>
        <error to="Kill"/>
    </action>
    <join name="join_extend" to="dump_funder_results"/>
    <action name="dump_funder_results">
        <spark xmlns="uri:oozie:spark-action:0.2">
@ -149,9 +506,9 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/result</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
-            <arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
+            <arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_parameters_link_prj.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_parameters_link_prj.json
@ -0,0 +1,36 @@
 [
 	{
 		"paramName":"rp",
 		"paramLongName":"relationPath",
 		"paramDescription": "the path of the relations used to select the results linked to projects",
 		"paramRequired": true
 	},
 	{
 		"paramName":"s",
 		"paramLongName":"sourcePath",
 		"paramDescription": "the path of the sequencial file to read",
 		"paramRequired": true
 	},
 	{
 		"paramName": "out",
 		"paramLongName": "outputPath",
 		"paramDescription": "the path used to store temporary output files",
 		"paramRequired": true
 	},
 	{
 		"paramName": "ssm",
 		"paramLongName": "isSparkSessionManaged",
 		"paramDescription": "true if the spark session is managed, false otherwise",
 		"paramRequired": false
 	},
 	{
 		"paramName":"tn",
 		"paramLongName":"resultTableName",
 		"paramDescription": "the name of the result table we are currently working on",
 		"paramRequired": true
 	}
 ]