BrBETA_dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java


package eu.dnetlib.dhp.oa.graph.dump.funderresults;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;

import eu.dnetlib.dhp.oa.graph.dump.Constants;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.api.zenodo.Community;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

/**
 * Splits the dumped results by funder and stores each subset in its own folder under the output path. The folder is
 * named after the first part of the funder namespace prefix (nsp), upper-cased. For the EC (nsp starting with
 * "corda") the folder name also carries the funding stream: EC_H2020 or EC_FP7.
 */
public class SparkDumpFunderResults implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkDumpFunderResults.class);

	/**
	 * Entry point of the job. Reads the parameters sourcePath, outputPath, relationPath and (optionally)
	 * isSparkSessionManaged, removes the output directory and then writes one dump per funder.
	 *
	 * @param args the command line arguments, parsed against funder_result_parameters.json
	 * @throws Exception if the parameters cannot be loaded or parsed, or the Spark job fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkDumpFunderResults.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		// the session is considered managed unless the parameter explicitly says otherwise
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String relationPath = parser.get("relationPath");
		log.info("relationPath: {}", relationPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				writeResultProjectList(spark, inputPath, outputPath, relationPath);
			});
	}

	/**
	 * Collects the distinct funder prefixes found in the targets of the result-project relations and writes, for each
	 * of them, the results linked to at least one project whose id starts with that prefix.
	 *
	 * @param spark the Spark session
	 * @param inputPath the folder containing the dumped results (publication, dataset, orp, software)
	 * @param outputPath the folder under which one sub-folder per funder is written
	 * @param relationPath the folder containing the "relation" sub-folder
	 */
	private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath,
		String relationPath) {

		Dataset<Relation> relation = Utils
			.readPath(spark, relationPath + "/relation", Relation.class)
			.filter(
				"dataInfo.deletedbyinference = false and lower(relClass) = '"
					+ Constants.RESULT_PROJECT_IS_PRODUCED_BY.toLowerCase(Locale.ROOT) + "'");

		Dataset<CommunityResult> result = Utils
			.readPath(spark, inputPath + "/publication", CommunityResult.class)
			.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
			.union(Utils.readPath(spark, inputPath + "/orp", CommunityResult.class))
			.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));

		// the first 15 characters of the relation target are used as the funder prefix
		// (presumably a 3 character entity prefix followed by the 12 character nsp -- TODO confirm)
		List<String> funderList = relation
			.select("target")
			.map((MapFunction<Row, String>) value -> value.getString(0).substring(0, 15), Encoders.STRING())
			.distinct()
			.collectAsList();

		funderList
			.forEach(funder -> writeFunderResult(funder, result, outputPath + "/" + funderDumpName(funder)));

	}

	/**
	 * Derives the name of the dump folder from a funder prefix.
	 *
	 * @param funder the funder prefix as collected from the relation targets; its first 3 characters are skipped
	 * @return "EC_H2020" or "EC_FP7" when the nsp starts with "corda", otherwise the upper-cased part of the nsp
	 *         preceding its first underscore; the whole upper-cased nsp when that part would be missing or empty
	 */
	private static String funderDumpName(String funder) {
		final String fundernsp = funder.substring(3);
		if (fundernsp.startsWith("corda")) {
			return fundernsp.endsWith("h2020") ? "EC_H2020" : "EC_FP7";
		}
		final int separator = fundernsp.indexOf('_');
		// indexOf returns -1 when there is no underscore (substring would throw) and 0 when the nsp starts with one
		// (the folder name would be empty and the dump would target the output root): keep the whole nsp then
		final String name = separator > 0 ? fundernsp.substring(0, separator) : fundernsp;
		return name.toUpperCase(Locale.ROOT);
	}

	/**
	 * Writes, as gzip compressed json, the results having at least one project whose id starts with the given funder
	 * prefix. Any content already present at the output path is overwritten.
	 *
	 * @param funder the funder prefix the project ids are matched against
	 * @param results the whole set of dumped results
	 * @param outputPath the folder where the selected results are written
	 */
	private static void writeFunderResult(String funder, Dataset<CommunityResult> results, String outputPath) {

		results.map((MapFunction<CommunityResult, CommunityResult>) r -> {
			if (r.getProjects() == null) {
				return null;
			}
			for (Project p : r.getProjects()) {
				// a project without id cannot match any funder
				if (p.getId() != null && p.getId().startsWith(funder)) {
					return r;
				}
			}
			return null;
		}, Encoders.bean(CommunityResult.class))
			// results not matching the funder were mapped to null above and are dropped here
			.filter(Objects::nonNull)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

}
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`package eu.dnetlib.dhp.oa.graph.dump.funderresults;`

			`import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;`

			`import java.io.Serializable;`
			`import java.util.*;`

refactoring and addition of the funder nsp first part as name for the dump instead of the whole nsp 2020-11-25 13:45:41 +01:00			`import eu.dnetlib.dhp.oa.graph.dump.Constants;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import org.apache.commons.io.IOUtils;`
			`import org.apache.spark.SparkConf;`
			`import org.apache.spark.api.java.function.MapFunction;`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`import org.apache.spark.sql.*;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import org.slf4j.Logger;`
			`import org.slf4j.LoggerFactory;`

			`import eu.dnetlib.dhp.application.ArgumentApplicationParser;`
new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`import eu.dnetlib.dhp.common.api.zenodo.Community;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;`
			`import eu.dnetlib.dhp.oa.graph.dump.Utils;`
			`import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;`
new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`import eu.dnetlib.dhp.schema.dump.oaf.community.Project;`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`import eu.dnetlib.dhp.schema.oaf.Relation;`
			`import scala.Tuple2;`

			`/**`
added java doc 2020-11-25 14:08:09 +01:00			`* Splits the dumped results by funder and stores them in a folder named as the funder nsp (for all the funders, but the EC`
			`* for the EC it specifies also the fundingStream (FP7 or H2020)`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`*/`
- 2020-11-18 18:56:48 +01:00			`public class SparkDumpFunderResults implements Serializable {`
			`private static final Logger log = LoggerFactory.getLogger(SparkDumpFunderResults.class);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`public static void main(String[] args) throws Exception {`
			`String jsonConfiguration = IOUtils`
			`.toString(`
- 2020-11-18 18:56:48 +01:00			`SparkDumpFunderResults.class`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`.getResourceAsStream(`
changed parameter file with the one associated to the job 2020-11-18 16:58:20 +01:00			`"/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json"));`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);`
			`parser.parseArgument(args);`

			`Boolean isSparkSessionManaged = Optional`
			`.ofNullable(parser.get("isSparkSessionManaged"))`
			`.map(Boolean::valueOf)`
			`.orElse(Boolean.TRUE);`
			`log.info("isSparkSessionManaged: {}", isSparkSessionManaged);`

			`final String inputPath = parser.get("sourcePath");`
			`log.info("inputPath: {}", inputPath);`

			`final String outputPath = parser.get("outputPath");`
			`log.info("outputPath: {}", outputPath);`

new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`final String relationPath = parser.get("relationPath");`
			`log.info("relationPath: {}", relationPath);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`SparkConf conf = new SparkConf();`

			`runWithSparkSession(`
			`conf,`
			`isSparkSessionManaged,`
			`spark -> {`
			`Utils.removeOutputDir(spark, outputPath);`
fixed issue in path name 2020-11-20 12:32:23 +01:00			`writeResultProjectList(spark, inputPath, outputPath, relationPath);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`});`
			`}`

fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`private static void writeResultProjectList(SparkSession spark, String inputPath, String outputPath,`
			`String relationPath) {`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
			`Dataset<Relation> relation = Utils`
fixed issue in path name 2020-11-20 12:32:23 +01:00			`.readPath(spark, relationPath + "/relation", Relation.class)`
refactoring and addition of the funder nsp first part as name for the dump instead of the whole nsp 2020-11-25 13:45:41 +01:00			`.filter("dataInfo.deletedbyinference = false and lower(relClass) = '" + Constants.RESULT_PROJECT_IS_PRODUCED_BY.toLowerCase()+ "'");`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`Dataset<CommunityResult> result = Utils`
			`.readPath(spark, inputPath + "/publication", CommunityResult.class)`
			`.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))`
changed directory name 2020-11-24 16:47:07 +01:00			`.union(Utils.readPath(spark, inputPath + "/orp", CommunityResult.class))`
new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`List<String> funderList = relation`
			`.select("target")`
			`.map((MapFunction<Row, String>) value -> value.getString(0).substring(0, 15), Encoders.STRING())`
			`.distinct()`
			`.collectAsList();`


refactoring and addition of the funder nsp first part as name for the dump instead of the whole nsp 2020-11-25 13:45:41 +01:00			`funderList.forEach(funder -> {`
			`String fundernsp = funder.substring(3);`
			`String funderdump;`
			`if (fundernsp.startsWith("corda")){`
			`funderdump = "EC_";`
			`if(fundernsp.endsWith("h2020")){`
			`funderdump += "H2020";`
			`}else{`
			`funderdump += "FP7";`
			`}`
			`}else{`
			`funderdump = fundernsp.substring(0, fundernsp.indexOf("_")).toUpperCase();`
			`}`
			`writeFunderResult(funder, result, outputPath + "/" + funderdump);`
			`});`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00
			`}`

			`private static void writeFunderResult(String funder, Dataset<CommunityResult> results, String outputPath) {`

			`results.map((MapFunction<CommunityResult, CommunityResult>) r -> {`
			`if (!Optional.ofNullable(r.getProjects()).isPresent()) {`
			`return null;`
			`}`
			`for (Project p : r.getProjects()) {`
			`if (p.getId().startsWith(funder)) {`
			`return r;`
			`}`
			`}`
			`return null;`
			`}, Encoders.bean(CommunityResult.class))`
			`.filter(Objects::nonNull)`
new logic and workflow for dump of results with link to projects. In this implementation the result match the model of the communityresult. 2020-11-19 19:15:39 +01:00			`.write()`
			`.mode(SaveMode.Overwrite)`
fixing issue on previous implementation 2020-11-24 14:44:53 +01:00			`.option("compression", "gzip")`
refactoring and addition of the funder nsp first part as name for the dump instead of the whole nsp 2020-11-25 13:45:41 +01:00			`.json(outputPath);`
code, workflow and parameters for the dump of the results associated to funders 2020-11-18 16:47:31 +01:00			`}`

			`}`