package eu.dnetlib.dhp.oa.graph.dump.skgif; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; import org.apache.avro.generic.GenericData; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.skgif.model.Grant; import eu.dnetlib.dhp.skgif.model.Identifier; import eu.dnetlib.dhp.skgif.model.Prefixes; import eu.dnetlib.dhp.skgif.model.RelationType; import scala.Tuple2; /** * @author miriam.baglioni * @Date 22/02/24 */ public class DumpGrant implements Serializable { private static final Logger log = LoggerFactory.getLogger(DumpGrant.class); public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( DumpGrant.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/dump/dump_grant_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String workingDir = parser.get("workingDir"); log.info("workingDir: {}", workingDir); final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> { Utils.removeOutputDir(spark, outputPath + "Grant"); mapGrants(spark, inputPath, outputPath); }); } private static void mapGrants(SparkSession spark, String inputPath, String outputPath) { Dataset projects = Utils .readPath(spark, inputPath + "project", Project.class) .filter( (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible()); Dataset relations = Utils .readPath(spark, inputPath + "relation", Relation.class) .filter( (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible() && r.getRelClass().equalsIgnoreCase(RelationType.ORGANIZATION_PARTICIPANT_IN_PROJECT.label)); projects .joinWith(relations, projects.col("id").equalTo(relations.col("target")), "left") .groupByKey((MapFunction, String>) t2 -> t2._1().getId(), Encoders.STRING()) .mapGroups((MapGroupsFunction, Grant>) (k, v) -> { Grant g = new Grant(); Tuple2 first = v.next(); g.setLocal_identifier(Utils.getIdentifier(Prefixes.GRANT, k)); g.setIdentifiers(getProjectIdentifier(first._1())); g.setTitle(first._1().getTitle().getValue()); g .setSummary( Optional .ofNullable(first._1().getSummary()) .map(value -> value.getValue()) .orElse(new String())); g .setAcronym( Optional .ofNullable(first._1().getAcronym()) .map(value -> value.getValue()) .orElse(new String())); g.setFunder(getFunderName(first._1().getFundingtree().get(0).getValue())); // * private String funding_stream;// fundingtree to be used the xpath //funding_level_[n] g.setFunding_stream(getFundingStream(first._1().getFundingtree().get(0).getValue())); g .setCurrency( Optional .ofNullable(first._1().getCurrency()) .map(value -> value.getValue()) .orElse(new String())); g .setFunded_amount( Optional .ofNullable(first._1().getFundedamount()) .orElse(null)); g .setKeywords( first ._1() .getSubjects() .stream() .map(s -> s.getValue()) .collect(Collectors.toList())); g .setStart_date( Optional .ofNullable(first._1().getStartdate()) .map(value -> value.getValue()) .orElse(new String())); g .setEnd_date( Optional .ofNullable(first._1().getEnddate()) .map(value -> value.getValue()) .orElse(new String())); g .setWebsite( Optional .ofNullable(first._1().getWebsiteurl()) .map(value -> value.getValue()) .orElse(new String())); if (Optional.ofNullable(first._2()).isPresent()) { List relevantOrganizatios = new ArrayList<>(); relevantOrganizatios.add(Utils.getIdentifier(Prefixes.ORGANIZATION, first._2().getSource())); v .forEachRemaining( t2 -> relevantOrganizatios .add(Utils.getIdentifier(Prefixes.ORGANIZATION, t2._2().getSource()))); g.setBeneficiaries(relevantOrganizatios); } return g; }, Encoders.bean(Grant.class)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath + "Grant"); } private static String getFundingStream(String fundingtree) throws DocumentException { final Document doc; doc = new SAXReader().read(new StringReader(fundingtree)); if (Optional.ofNullable(doc.selectNodes("//funding_level_0")).isPresent() && doc.selectNodes("//funding_level_0").size() > 0) return ((org.dom4j.Node) (doc.selectNodes("//funding_level_0").get(0))).getText(); return new String(); } private static String getFunderName(String fundingtree) throws DocumentException { final Document doc; doc = new SAXReader().read(new StringReader(fundingtree)); // f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText()); return ((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText(); // f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText()); } private static List getProjectIdentifier(Project project) throws DocumentException { List identifiers = new ArrayList<>(); if (project.getPid().size() > 0) project .getPid() .stream() .forEach(p -> identifiers.add(Identifier.newInstance(p.getQualifier().getClassid(), p.getValue()))); identifiers .add( Identifier .newInstance( getFunderName(project.getFundingtree().get(0).getValue()), project.getCode().getValue())); return identifiers; } }