package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.ExtendingOrganization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.skgif.model.*;
import scala.Tuple2;

/**
 * Dumps the OpenAIRE projects as SKG-IF grants.
 *
 * @author miriam.baglioni
 * @Date 22/02/24
 */
public class DumpGrant implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(DumpGrant.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                DumpGrant.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/skgif/dump_grant_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String workingDir = parser.get("workingDir");
        log.info("workingDir: {}", workingDir);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath + "grants");
                mapGrants(spark, inputPath, outputPath, workingDir);
            });
    }

    private static void mapGrants(SparkSession spark, String inputPath, String outputPath, String workingDir) {
        // keep only visible projects that have not been deleted by inference
        Dataset<Project> projects = Utils
            .readPath(spark, inputPath + "project", Project.class)
            .filter(
                (FilterFunction<Project>) p -> !p.getDataInfo().getDeletedbyinference()
                    && !p.getDataInfo().getInvisible());
        Dataset<ExtendingOrganization> partecipatingOrgs = Utils
            .readPath(
                spark, workingDir + "relations/project_partecipating_organization",
                ExtendingOrganization.class);
        // deduplicate the projects by id, keeping one record per group
        projects = projects
            .groupByKey((MapFunction<Project, String>) p -> p.getId(), Encoders.STRING())
            .mapGroups(
                (MapGroupsFunction<String, Project, Project>) (k, v) -> v.next(),
                Encoders.bean(Project.class));
        // left join with the participating organizations and dump each project as a grant
        projects
            .joinWith(partecipatingOrgs, projects.col("id").equalTo(partecipatingOrgs.col("entityId")), "left")
            .map((MapFunction<Tuple2<Project, ExtendingOrganization>, Grant>) t2 -> {
                Grant g = dumpGrant(t2._1());
                if (Optional.ofNullable(t2._2()).isPresent())
                    g.setBeneficiaries(t2._2().getRelevant_organization());
                return g;
            }, Encoders.bean(Grant.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "grants");
    }

    private static Grant dumpGrant(Project project) throws DocumentException {
        Grant g = new Grant();
        g.setLocal_identifier(project.getId());
        g.setGrantCode(project.getCode().getValue());
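        // identifiers: the project PIDs plus one entry pairing the funder name with the grant code
        // (built in getProjectIdentifier below)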
        g.setIdentifiers(getProjectIdentifier(project));
        if (Optional.ofNullable(project.getTitle()).isPresent())
            g.setTitle(project.getTitle().getValue());
        g
            .setSummary(
                Optional
                    .ofNullable(project.getSummary())
                    .map(value -> value.getValue())
                    .orElse(""));
        g
            .setAcronym(
                Optional
                    .ofNullable(project.getAcronym())
                    .map(value -> value.getValue())
                    .orElse(""));
        if (Optional.ofNullable(project.getFundingtree()).isPresent()
            && project.getFundingtree().size() > 0) {
            g.setFunder(Utils.getFunderName(project.getFundingtree().get(0).getValue()));
            // the funding stream is extracted from the fundingtree via the xpath //funding_level_[n]
            g.setFunding_stream(getFundingStream(project.getFundingtree().get(0).getValue()));
        }
        g
            .setCurrency(
                Optional
                    .ofNullable(project.getCurrency())
                    .map(value -> value.getValue())
                    .orElse(""));
        g.setFunded_amount(project.getFundedamount());
        if (Optional.ofNullable(project.getSubjects()).isPresent())
            g
                .setKeywords(
                    project
                        .getSubjects()
                        .stream()
                        .map(s -> s.getValue())
                        .collect(Collectors.toList()));
        g
            .setStart_date(
                Optional
                    .ofNullable(project.getStartdate())
                    .map(value -> value.getValue())
                    .orElse(""));
        g
            .setEnd_date(
                Optional
                    .ofNullable(project.getEnddate())
                    .map(value -> value.getValue())
                    .orElse(""));
        g
            .setWebsite(
                Optional
                    .ofNullable(project.getWebsiteurl())
                    .map(value -> value.getValue())
                    .orElse(""));
        return g;
    }

    private static String getFundingStream(String fundingtree) throws DocumentException {
        final Document doc = new SAXReader().read(new StringReader(fundingtree));
        // the funding stream is the name of the first funding_level_0 node, when present
        if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent()
            && doc.selectNodes("//funding_level_0").size() > 0
            && Optional.ofNullable(doc.selectNodes("//funding_level_0/name")).isPresent()
            && doc.selectNodes("//funding_level_0/name").size() > 0)
            return ((org.dom4j.Node) doc.selectNodes("//funding_level_0/name").get(0)).getText();
        return "";
    }

    private static List<Identifier> getProjectIdentifier(Project project) throws DocumentException {
        List<Identifier> identifiers = new ArrayList<>();
        if (project.getPid() != null && project.getPid().size() > 0)
            project
                .getPid()
                .forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue())));
        identifiers
            .add(
                Identifier
                    .newInstance(
                        Utils.getFunderName(project.getFundingtree().get(0).getValue()),
                        project.getCode().getValue()));
        return identifiers;
    }
}