dhp-graph-dump/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SparkUpdateProjectInfo.java

317 lines
9.7 KiB
Java

package eu.dnetlib.dhp.oa.graph.dump.eosc;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.eosc.model.*;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Project;
import scala.Array;
import scala.Tuple2;
public class SparkUpdateProjectInfo implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkUpdateProjectInfo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/eosc_project_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String preparedInfoPath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", preparedInfoPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
final String dumpType = Optional
.ofNullable(parser.get("dumpType"))
.orElse(Constants.DUMPTYPE.COMMUNITY.getType());
log.info("dumpType: {}", dumpType);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, workingPath + resultType + "extendedproject");
extend(spark, inputPath, workingPath, preparedInfoPath, outputPath, resultType);
});
}
private static void extend(
SparkSession spark,
String inputPath,
String workingPath,
String preparedInfoPath,
String outputPath,
String resultType) {
Dataset<Result> result = Utils.readPath(spark, workingPath + resultType + "extendedaffiliation", Result.class);
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
result
.joinWith(
resultProject, result.col("id").equalTo(resultProject.col("resultId")),
"left")
.map((MapFunction<Tuple2<Result, ResultProject>, Result>) value -> {
Result r = value._1();
Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
return r;
}, Encoders.bean(Result.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.json(workingPath + resultType + "extendedproject");
Dataset<Project> project = Utils.readPath(spark, inputPath + "/project", Project.class);
Dataset<String> projectIds = result
.joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")))
.flatMap(
(FlatMapFunction<Tuple2<Result, ResultProject>, String>) t2 -> t2
._2()
.getProjectsList()
.stream()
.map(p -> p.getId())
.collect(Collectors.toList())
.iterator(),
Encoders.STRING())
.distinct();
projectIds
.joinWith(project, projectIds.col("value").equalTo(project.col("id")))
.map(
(MapFunction<Tuple2<String, Project>, eu.dnetlib.dhp.eosc.model.Project>) t2 -> mapProject(t2._2()),
Encoders.bean(eu.dnetlib.dhp.eosc.model.Project.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + resultType + "project");
result
.joinWith(
resultProject, result.col("id").equalTo(resultProject.col("resultId")))
.map(
(MapFunction<Tuple2<Result, ResultProject>, ResultProject>) t2 -> t2._2(),
Encoders.bean(ResultProject.class))
.flatMap(
(FlatMapFunction<ResultProject, Relation>) rp -> rp
.getProjectsList()
.stream()
.map(p -> Relation.newInstance(rp.getResultId(), p.getId()))
.collect(Collectors.toList())
.iterator(),
Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + resultType + "resultProject");
}
private static eu.dnetlib.dhp.eosc.model.Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p)
throws DocumentException {
if (Boolean.TRUE.equals(p.getDataInfo().getDeletedbyinference()))
return null;
eu.dnetlib.dhp.eosc.model.Project project = new eu.dnetlib.dhp.eosc.model.Project();
Optional
.ofNullable(p.getId())
.ifPresent(id -> project.setId(id));
Optional
.ofNullable(p.getWebsiteurl())
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
Optional
.ofNullable(p.getCode())
.ifPresent(code -> project.setCode(code.getValue()));
Optional
.ofNullable(p.getAcronym())
.ifPresent(acronynim -> project.setAcronym(acronynim.getValue()));
Optional
.ofNullable(p.getTitle())
.ifPresent(title -> project.setTitle(title.getValue()));
Optional
.ofNullable(p.getStartdate())
.ifPresent(sdate -> project.setStartdate(sdate.getValue()));
Optional
.ofNullable(p.getEnddate())
.ifPresent(edate -> project.setEnddate(edate.getValue()));
Optional
.ofNullable(p.getCallidentifier())
.ifPresent(cide -> project.setCallidentifier(cide.getValue()));
Optional
.ofNullable(p.getKeywords())
.ifPresent(key -> project.setKeywords(key.getValue()));
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
boolean mandate = false;
if (omandate.isPresent()) {
if (omandate.get().getValue().equals("true")) {
mandate = true;
}
}
if (oecsc39.isPresent()) {
if (oecsc39.get().getValue().equals("true")) {
mandate = true;
}
}
project.setOpenaccessmandateforpublications(mandate);
project.setOpenaccessmandatefordataset(false);
Optional
.ofNullable(p.getEcarticle29_3())
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true")));
project
.setSubject(
Optional
.ofNullable(p.getSubjects())
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
Optional
.ofNullable(p.getSummary())
.ifPresent(summary -> project.setSummary(summary.getValue()));
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
Optional<Float> ototalcost = Optional.ofNullable(p.getTotalcost());
if (ocurrency.isPresent()) {
if (ofundedamount.isPresent()) {
if (ototalcost.isPresent()) {
project
.setGranted(
Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
} else {
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
}
}
}
project
.setH2020programme(
Optional
.ofNullable(p.getH2020classification())
.map(
classification -> classification
.stream()
.map(
c -> Programme
.newInstance(
c.getH2020Programme().getCode(), c.getH2020Programme().getDescription()))
.collect(Collectors.toList()))
.orElse(new ArrayList<>()));
Optional<List<Field<String>>> ofundTree = Optional
.ofNullable(p.getFundingtree());
List<Funder> funList = new ArrayList<>();
if (ofundTree.isPresent()) {
for (Field<String> fundingtree : ofundTree.get()) {
funList.add(getFunder(fundingtree.getValue()));
}
}
project.setFunding(funList);
return project;
}
public static Funder getFunder(String fundingtree) throws DocumentException {
Funder f = new Funder();
final Document doc;
doc = new SAXReader().read(new StringReader(fundingtree));
f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
String id = "";
StringBuilder bld = new StringBuilder();
int level = 0;
List<org.dom4j.Node> nodes = doc.selectNodes("//funding_level_" + level);
while (!nodes.isEmpty()) {
for (org.dom4j.Node n : nodes) {
List node = n.selectNodes("./id");
id = ((org.dom4j.Node) node.get(0)).getText();
id = id.substring(id.indexOf("::") + 2);
node = n.selectNodes("./description");
bld.append(((Node) node.get(0)).getText() + " - ");
}
level += 1;
nodes = doc.selectNodes("//funding_level_" + level);
}
String description = bld.toString();
if (!id.equals("")) {
Fundings fundings = new Fundings();
fundings.setId(id);
fundings.setDescription(description.substring(0, description.length() - 3).trim());
f.setFunding_stream(fundings);
}
return f;
}
}