dhp-graph-dump/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpGrant.java


package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.avro.generic.GenericData;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.skgif.model.Grant;
import eu.dnetlib.dhp.skgif.model.Identifier;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.skgif.model.RelationType;
import scala.Tuple2;

/**
 * Spark job that exports the projects of the graph as SKG-IF {@link Grant} records.
 * <p>
 * Projects are read from {@code <sourcePath>project}, joined with the relations read from
 * {@code <sourcePath>relation} whose class is {@link RelationType#ORGANIZATION_PARTICIPANT_IN_PROJECT},
 * and written as gzip-compressed JSON to {@code <outputPath>Grant}. The entity names are appended to the
 * paths as they are, so both paths are expected to already end with a path separator.
 *
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class DumpGrant implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);

	/**
	 * Entry point of the job.
	 *
	 * @param args command line arguments described by {@code dump_grant_parameters.json}; the ones read here
	 *             are {@code isSparkSessionManaged} (optional, defaults to true), {@code sourcePath},
	 *             {@code workingDir} (only logged) and {@code outputPath}
	 * @throws Exception if the arguments cannot be parsed or the Spark job fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				DumpGrant.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String workingDir = parser.get("workingDir");
		log.info("workingDir: {}", workingDir);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				// start from a clean output location before writing the grants
				Utils.removeOutputDir(spark, outputPath + "Grant");

				mapGrants(spark, inputPath, outputPath);
			});
	}

	/**
	 * Builds one {@link Grant} per project and writes the result as gzip-compressed JSON.
	 * <p>
	 * Projects and relations flagged as deleted by inference or invisible are discarded. The join is a left
	 * join, so a project without participating organizations is still exported, just without beneficiaries.
	 *
	 * @param spark      the active Spark session
	 * @param inputPath  base path of the graph; {@code project} and {@code relation} are appended to it
	 * @param outputPath base path of the dump; {@code Grant} is appended to it
	 */
	private static void mapGrants(SparkSession spark, String inputPath, String outputPath) {
		Dataset<Project> projects = Utils
			.readPath(spark, inputPath + "project", Project.class)
			.filter(
				(FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference() &&
					!p.getDataInfo().getInvisible());
		Dataset<Relation> relations = Utils
			.readPath(spark, inputPath + "relation", Relation.class)
			.filter(
				(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
					!r.getDataInfo().getInvisible() &&
					r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label));
		projects
			.joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left")
			.groupByKey((MapFunction<Tuple2<Project, Relation>, String>) t2 -> t2._1().getId(), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, Grant>) (k, v) -> {
				// every row of the group carries the same project: the first one is enough to describe the grant
				final Tuple2<Project, Relation> first = v.next();
				final Grant g = buildGrant(k, first._1());
				// with a left join an unmatched project yields a single row whose relation side is null
				if (first._2() != null) {
					final List<String> beneficiaries = new ArrayList<>();
					beneficiaries.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource()));
					v
						.forEachRemaining(
							t2 -> beneficiaries
								.add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource())));
					g.setBeneficiaries(beneficiaries);
				}
				return g;
			}, Encoders.bean(Grant.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "Grant");
	}

	/**
	 * Maps the descriptive metadata of a project onto a new {@link Grant}. Beneficiaries are not set here.
	 * <p>
	 * Missing textual fields are exported as the empty string and missing subjects as an empty list, so a
	 * single incomplete project does not make the whole job fail.
	 *
	 * @param projectId the graph identifier of the project (the grouping key)
	 * @param project   the project to map
	 * @return the grant describing the project
	 * @throws DocumentException if the funding tree of the project is not well-formed XML
	 */
	private static Grant buildGrant(String projectId, Project project) throws DocumentException {
		// parse the funding tree once and reuse it for funder name, funding stream and identifiers
		final Document fundingTree = parseFundingTree(project);
		final String funderName = getFunderName(fundingTree);

		final Grant g = new Grant();
		g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, projectId));
		g.setIdentifiers(getProjectIdentifier(project, funderName));
		g
			.setTitle(
				Optional
					.ofNullable(project.getTitle())
					.map(value -> value.getValue())
					.orElse(""));
		g
			.setSummary(
				Optional
					.ofNullable(project.getSummary())
					.map(value -> value.getValue())
					.orElse(""));
		g
			.setAcronym(
				Optional
					.ofNullable(project.getAcronym())
					.map(value -> value.getValue())
					.orElse(""));
		g.setFunder(funderName);
		// the funding stream is taken from the funding tree via the xpath //funding_level_0
		g.setFunding_stream(getFundingStream(fundingTree));
		g
			.setCurrency(
				Optional
					.ofNullable(project.getCurrency())
					.map(value -> value.getValue())
					.orElse(""));
		g.setFunded_amount(project.getFundedamount());

		final List<String> keywords = project.getSubjects() == null
			? new ArrayList<String>()
			: project.getSubjects().stream().map(s -> s.getValue()).collect(Collectors.toList());
		g.setKeywords(keywords);

		g
			.setStart_date(
				Optional
					.ofNullable(project.getStartdate())
					.map(value -> value.getValue())
					.orElse(""));
		g
			.setEnd_date(
				Optional
					.ofNullable(project.getEnddate())
					.map(value -> value.getValue())
					.orElse(""));
		g
			.setWebsite(
				Optional
					.ofNullable(project.getWebsiteurl())
					.map(value -> value.getValue())
					.orElse(""));
		return g;
	}

	/**
	 * Parses the first funding tree of the project.
	 *
	 * @param project the project whose funding tree has to be parsed
	 * @return the parsed XML document, or {@code null} when the project has no funding tree or the first one
	 *         is null or blank
	 * @throws DocumentException if the funding tree is present but is not well-formed XML
	 */
	private static Document parseFundingTree(Project project) throws DocumentException {
		if (project.getFundingtree() == null || project.getFundingtree().isEmpty())
			return null;
		final String xml = Optional
			.ofNullable(project.getFundingtree().get(0))
			.map(tree -> tree.getValue())
			.orElse(null);
		if (xml == null || xml.trim().isEmpty())
			return null;
		return new SAXReader().read(new StringReader(xml));
	}

	/**
	 * Extracts the funding stream, i.e. the text of the first {@code //funding_level_0} node.
	 *
	 * @param fundingTree the parsed funding tree, possibly {@code null}
	 * @return the text of the node, or the empty string when the tree or the node is missing
	 */
	private static String getFundingStream(Document fundingTree) {
		if (fundingTree == null)
			return "";
		// NOTE(review): getText() does not recurse into child elements; if funding_level_0 only wraps
		// children (e.g. a name element) this is blank - confirm against the funding tree schema
		final org.dom4j.Node level0 = fundingTree.selectSingleNode("//funding_level_0");
		return level0 == null ? "" : level0.getText();
	}

	/**
	 * Extracts the funder name, i.e. the text of the first {@code //funder/name} node.
	 *
	 * @param fundingTree the parsed funding tree, possibly {@code null}
	 * @return the funder name, or the empty string when the tree or the node is missing
	 */
	private static String getFunderName(Document fundingTree) {
		if (fundingTree == null)
			return "";
		// NOTE(review): earlier commented-out code also read //funder/shortname and //funder/jurisdiction;
		// they are not exported at the moment
		final org.dom4j.Node name = fundingTree.selectSingleNode("//funder/name");
		return name == null ? "" : name.getText();
	}

	/**
	 * Collects the identifiers of the project: one per persistent identifier (scheme = qualifier class id),
	 * plus the project code scoped by the funder name.
	 * <p>
	 * The funder-scoped identifier is added only when both the funder name and the project code are
	 * available, since an identifier lacking its scheme or its value carries no information.
	 *
	 * @param project    the project to describe
	 * @param funderName the funder name used as scheme for the project code; empty when unknown
	 * @return the identifiers of the project, possibly empty but never {@code null}
	 */
	private static List<Identifier> getProjectIdentifier(Project project, String funderName) {
		final List<Identifier> identifiers = new ArrayList<>();
		if (project.getPid() != null)
			project
				.getPid()
				.forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())));

		final String code = Optional
			.ofNullable(project.getCode())
			.map(value -> value.getValue())
			.orElse(null);
		if (code != null && funderName != null && !funderName.isEmpty())
			identifiers.add(Identifier.newInstance(funderName, code));
		return identifiers;
	}
}