dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java


package eu.dnetlib.dhp.oa.graph.dump.funderresults;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
import scala.Tuple2;

/**
 * Splits the dumped results by funder. For every distinct funder short name found among the projects linked to the
 * dumped results, the results associated to that funder are written as gzip-compressed JSON under
 * {@code <outputPath>/<funder short name>}.
 */
public class SparkDumpFunderResults implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkDumpFunderResults.class);

	/**
	 * Job entry point. Parses the arguments described in {@code funder_result_parameters.json}, calls
	 * {@code Utils.removeOutputDir} on the output path and then writes one folder of results per funder.
	 *
	 * @param args the command line arguments ({@code sourcePath}, {@code outputPath}, {@code graphPath} and the
	 *             optional {@code isSparkSessionManaged}, which defaults to {@code true} when absent)
	 * @throws Exception if the parameter file cannot be read, the arguments cannot be parsed or the Spark job fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpFunderResults.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String graphPath = parser.get("graphPath");
		log.info("relationPath: {}", graphPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				writeResultProjectList(spark, inputPath, outputPath, graphPath);
			});
	}

	/**
	 * Writes, for each funder short name found in the dumped results, the results associated to that funder.
	 *
	 * @param spark      the active Spark session
	 * @param inputPath  the folder containing the {@code publication}, {@code dataset},
	 *                   {@code otherresearchproduct} and {@code software} dumps
	 * @param outputPath the folder under which one sub-folder per funder is written
	 * @param graphPath  currently not used by this method; kept so that the signature stays unchanged
	 */
	private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath,
		String graphPath) {

		// The funder names are brought back to the driver: the SparkSession and the Dataset API can only be used
		// there, so the per-funder write below cannot run inside an executor-side foreach.
		List<String> funderList = readResults(spark, inputPath)
			.flatMap(
				(FlatMapFunction<CommunityResult, String>) cr -> funderShortNames(cr).iterator(),
				Encoders.STRING())
			.distinct()
			.collectAsList();

		for (String funder : funderList) {
			getFunderResult(funder, inputPath, spark)
				.write()
				.mode(SaveMode.Overwrite)
				.option("compression", "gzip")
				.json(outputPath + "/" + funder);
		}
	}

	/**
	 * Selects the results having at least one project whose funder short name matches (ignoring case) the given one.
	 *
	 * @param funderName the funder short name to look for
	 * @param inputPath  the folder containing the dumped results, split by result type
	 * @param spark      the active Spark session
	 * @return the results associated to the funder; never {@code null}
	 */
	private static Dataset<CommunityResult> getFunderResult(String funderName, String inputPath, SparkSession spark) {
		// Non-matching records are dropped by emitting nothing for them, so that no null element is ever handed
		// to the bean encoder.
		return readResults(spark, inputPath)
			.flatMap((FlatMapFunction<CommunityResult, CommunityResult>) cr -> {
				List<CommunityResult> matching = new ArrayList<>();
				if (isFundedBy(cr, funderName)) {
					matching.add(cr);
				}
				return matching.iterator();
			}, Encoders.bean(CommunityResult.class));
	}

	/**
	 * Loads, via {@code Utils.readPath}, the four result type folders found under the input path and unions them.
	 */
	private static Dataset<CommunityResult> readResults(SparkSession spark, String inputPath) {
		return Utils
			.readPath(spark, inputPath + "/publication", CommunityResult.class)
			.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
			.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class))
			.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
	}

	/**
	 * Collects the non-null funder short names of the projects linked to the result; empty when there are none.
	 */
	private static List<String> funderShortNames(CommunityResult cr) {
		List<String> names = new ArrayList<>();
		if (cr.getProjects() == null) {
			return names;
		}
		for (Project p : cr.getProjects()) {
			if (p != null && p.getFunder() != null && p.getFunder().getShortName() != null) {
				names.add(p.getFunder().getShortName());
			}
		}
		return names;
	}

	/**
	 * Tells whether the result has a project whose funder short name equals, ignoring case, the given funder name.
	 */
	private static boolean isFundedBy(CommunityResult cr, String funderName) {
		if (cr.getProjects() == null) {
			return false;
		}
		for (Project p : cr.getProjects()) {
			if (p != null && p.getFunder() != null && funderName.equalsIgnoreCase(p.getFunder().getShortName())) {
				return true;
			}
		}
		return false;
	}

}
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`package eu.dnetlib.dhp.oa.graph.dump.funderresults;`

			`import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;`

			`import java.io.Serializable;`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import java.util.ArrayList;`
suggestions from SonarLint 2021-08-11 12:13:22 +02:00			`import java.util.List;`
			`import java.util.Objects;`
			`import java.util.Optional;`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import java.util.stream.Collectors;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import com.fasterxml.jackson.databind.ObjectMapper;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import org.apache.commons.io.IOUtils;`
			`import org.apache.spark.SparkConf;`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import org.apache.spark.api.java.function.FlatMapFunction;`
			`import org.apache.spark.api.java.function.ForeachFunction;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import org.apache.spark.api.java.function.MapFunction;`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import org.apache.spark.api.java.function.MapGroupsFunction;`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`import org.apache.spark.sql.*;`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import org.jetbrains.annotations.Nullable;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import org.slf4j.Logger;`
			`import org.slf4j.LoggerFactory;`

			`import eu.dnetlib.dhp.application.ArgumentApplicationParser;`
			`import eu.dnetlib.dhp.oa.graph.dump.Utils;`
new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`import eu.dnetlib.dhp.schema.dump.oaf.community.Project;`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`import scala.Tuple2;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`/**`
added java doc 2020-11-25 14:08:09 +01:00			`* Splits the dumped results by funder and stores them in a folder named as the funder nsp (for all the funders, but the EC`
			`* for the EC it specifies also the fundingStream (FP7 or H2020)`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`*/`
- 2020-11-18 18:56:48 +01:00			`public class SparkDumpFunderResults implements Serializable {`
			`private static final Logger log = LoggerFactory.getLogger(SparkDumpFunderResults.class);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`public static void main(String[] args) throws Exception {`
			`String jsonConfiguration = IOUtils`
			`.toString(`
- 2020-11-18 18:56:48 +01:00			`SparkDumpFunderResults.class`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`.getResourceAsStream(`
changed parameter file with the one associated to the job 2020-11-18 16:58:20 +01:00			`"/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json"));`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);`
			`parser.parseArgument(args);`

			`Boolean isSparkSessionManaged = Optional`
- 2021-10-01 12:59:47 +02:00			`.ofNullable(parser.get("isSparkSessionManaged"))`
			`.map(Boolean::valueOf)`
			`.orElse(Boolean.TRUE);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`log.info("isSparkSessionManaged: {}", isSparkSessionManaged);`

			`final String inputPath = parser.get("sourcePath");`
			`log.info("inputPath: {}", inputPath);`

			`final String outputPath = parser.get("outputPath");`
			`log.info("outputPath: {}", outputPath);`

extended the logic of the dump to consider the validation date in the relation (also in the dumped result for communities and funders at the level of the project), the extension on the instance for the APC, the pid, the alternate identifiers, and the extension of the AccessRight to store the OpenAccessRoute. Added new resources for testing and extended the old class to verify the new dump. Fixed also issue on relation dump: only relations whose source and target are entities in the graph are dumped. The same holds for references to projects 2021-08-06 18:56:18 +02:00			`final String graphPath = parser.get("graphPath");`
			`log.info("relationPath: {}", graphPath);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`SparkConf conf = new SparkConf();`

			`runWithSparkSession(`
- 2021-10-01 12:59:47 +02:00			`conf,`
			`isSparkSessionManaged,`
			`spark -> {`
			`Utils.removeOutputDir(spark, outputPath);`
			`writeResultProjectList(spark, inputPath, outputPath, graphPath);`
			`});`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`}`

fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath,`
- 2021-10-01 12:59:47 +02:00			`String graphPath) {`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`Dataset<String> funderList = Utils`
			`.readPath(spark, inputPath + "/publication", CommunityResult.class)`
			`.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))`
			`.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class))`
			`.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class))`
			`.flatMap((FlatMapFunction<CommunityResult, String>) cr ->`
			`cr.getProjects().stream().map(p -> p.getFunder().getShortName()).collect(Collectors.toList()).iterator()`
			`, Encoders.STRING())`
			`.distinct();`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
[Dump Funders] - 2022-03-23 17:10:19 +01:00			`Dataset<CommunityResult> pubs;`
			`Dataset<CommunityResult> result ;`
			`pubs = Utils`
			`.readPath(spark, inputPath + "/publication", CommunityResult.class);`
			`Dataset<CommunityResult> dats = Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class);`
			`Dataset<CommunityResult> orp = Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class);`
			`Dataset<CommunityResult> sw = Utils.readPath(spark, inputPath + "/software", CommunityResult.class);`
			`result = pubs.union(dats).union(orp).union(sw);`

[Dump Funders] - 2022-03-23 16:08:14 +01:00			`funderList.foreach((ForeachFunction<String>) funder ->`
			`getFunderResult(funder, inputPath, spark)`
			`.write()`
			`.mode(SaveMode.Overwrite)`
			`.option("compression", "gzip")`
			`.json(outputPath + "/" + funder)`

			`);`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00

code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`}`

modified code to split the Croatian funder 2021-07-13 14:31:45 +02:00
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`@Nullable`
			`private static Dataset<CommunityResult> getFunderResult(String funderName, String inputPath, SparkSession spark) {`
[Dump Funders] - 2022-03-23 17:10:19 +01:00			`Dataset<CommunityResult> pubs;`
			`Dataset<CommunityResult> result ;`
			`pubs = Utils`
			`.readPath(spark, inputPath + "/publication", CommunityResult.class);`
			`Dataset<CommunityResult> dats = Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class);`
			`Dataset<CommunityResult> orp = Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class);`
			`Dataset<CommunityResult> sw = Utils.readPath(spark, inputPath + "/software", CommunityResult.class);`
			`result = pubs.union(dats).union(orp).union(sw);`
			`Dataset<CommunityResult> tmp = result.map((MapFunction<CommunityResult, CommunityResult>) cr -> {`
[Dump Funders] - 2022-03-23 16:08:14 +01:00			`if (!Optional.ofNullable(cr.getProjects()).isPresent()) {`
			`return null;`
			`}`
			`for (Project p : cr.getProjects()) {`
			`if (p.getFunder().getShortName().equalsIgnoreCase(funderName)) {`
			`return cr;`
			`}`
			`}`
			`return null;`
			`}, Encoders.bean(CommunityResult.class))`
			`.filter(Objects::nonNull);`
[Dump Funders] - 2022-03-23 17:10:19 +01:00			`System.out.println(tmp.count());`
			`return tmp;`
modified code to split the Croatian funder 2021-07-13 14:31:45 +02:00
			`}`

[Dump Funders] - 2022-03-23 16:08:14 +01:00
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`}`