317 lines
9.7 KiB
Java
317 lines
9.7 KiB
Java
|
|
package eu.dnetlib.dhp.oa.graph.dump.eosc;
|
|
|
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|
|
|
import java.io.Serializable;
|
|
import java.io.StringReader;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Optional;
|
|
import java.util.stream.Collectors;
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.spark.SparkConf;
|
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
|
import org.apache.spark.api.java.function.ForeachFunction;
|
|
import org.apache.spark.api.java.function.MapFunction;
|
|
import org.apache.spark.sql.*;
|
|
import org.dom4j.Document;
|
|
import org.dom4j.DocumentException;
|
|
import org.dom4j.Node;
|
|
import org.dom4j.io.SAXReader;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
import eu.dnetlib.dhp.eosc.model.*;
|
|
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
|
import scala.Array;
|
|
import scala.Tuple2;
|
|
|
|
public class SparkUpdateProjectInfo implements Serializable {
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
|
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
String jsonConfiguration = IOUtils
|
|
.toString(
|
|
SparkUpdateProjectInfo.class
|
|
.getResourceAsStream(
|
|
"/eu/dnetlib/dhp/oa/graph/dump/eosc_project_input_parameters.json"));
|
|
|
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
|
parser.parseArgument(args);
|
|
|
|
Boolean isSparkSessionManaged = Optional
|
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
|
.map(Boolean::valueOf)
|
|
.orElse(Boolean.TRUE);
|
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
|
|
|
final String inputPath = parser.get("sourcePath");
|
|
log.info("inputPath: {}", inputPath);
|
|
|
|
final String workingPath = parser.get("workingPath");
|
|
log.info("workingPath: {}", workingPath);
|
|
|
|
final String preparedInfoPath = parser.get("preparedInfoPath");
|
|
log.info("preparedInfoPath: {}", preparedInfoPath);
|
|
|
|
final String outputPath = parser.get("outputPath");
|
|
log.info("outputPath: {}", outputPath);
|
|
|
|
final String resultType = parser.get("resultType");
|
|
log.info("resultType: {}", resultType);
|
|
|
|
final String dumpType = Optional
|
|
.ofNullable(parser.get("dumpType"))
|
|
.orElse(Constants.DUMPTYPE.COMMUNITY.getType());
|
|
log.info("dumpType: {}", dumpType);
|
|
|
|
SparkConf conf = new SparkConf();
|
|
|
|
runWithSparkSession(
|
|
conf,
|
|
isSparkSessionManaged,
|
|
spark -> {
|
|
Utils.removeOutputDir(spark, workingPath + resultType + "extendedproject");
|
|
extend(spark, inputPath, workingPath, preparedInfoPath, outputPath, resultType);
|
|
});
|
|
}
|
|
|
|
private static void extend(
|
|
SparkSession spark,
|
|
String inputPath,
|
|
String workingPath,
|
|
String preparedInfoPath,
|
|
String outputPath,
|
|
String resultType) {
|
|
|
|
Dataset<Result> result = Utils.readPath(spark, workingPath + resultType + "extendedaffiliation", Result.class);
|
|
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
|
|
result
|
|
.joinWith(
|
|
resultProject, result.col("id").equalTo(resultProject.col("resultId")),
|
|
"left")
|
|
.map((MapFunction<Tuple2<Result, ResultProject>, Result>) value -> {
|
|
Result r = value._1();
|
|
Optional.ofNullable(value._2()).ifPresent(rp -> r.setProjects(rp.getProjectsList()));
|
|
return r;
|
|
}, Encoders.bean(Result.class))
|
|
.write()
|
|
.option("compression", "gzip")
|
|
.mode(SaveMode.Append)
|
|
.json(workingPath + resultType + "extendedproject");
|
|
|
|
Dataset<Project> project = Utils.readPath(spark, inputPath + "/project", Project.class);
|
|
|
|
Dataset<String> projectIds = result
|
|
.joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")))
|
|
.flatMap(
|
|
(FlatMapFunction<Tuple2<Result, ResultProject>, String>) t2 -> t2
|
|
._2()
|
|
.getProjectsList()
|
|
.stream()
|
|
.map(p -> p.getId())
|
|
.collect(Collectors.toList())
|
|
.iterator(),
|
|
Encoders.STRING())
|
|
.distinct();
|
|
|
|
projectIds
|
|
.joinWith(project, projectIds.col("value").equalTo(project.col("id")))
|
|
.map(
|
|
(MapFunction<Tuple2<String, Project>, eu.dnetlib.dhp.eosc.model.Project>) t2 -> mapProject(t2._2()),
|
|
Encoders.bean(eu.dnetlib.dhp.eosc.model.Project.class))
|
|
.write()
|
|
.mode(SaveMode.Overwrite)
|
|
.option("compression", "gzip")
|
|
.json(workingPath + resultType + "project");
|
|
|
|
result
|
|
.joinWith(
|
|
resultProject, result.col("id").equalTo(resultProject.col("resultId")))
|
|
.map(
|
|
(MapFunction<Tuple2<Result, ResultProject>, ResultProject>) t2 -> t2._2(),
|
|
Encoders.bean(ResultProject.class))
|
|
.flatMap(
|
|
(FlatMapFunction<ResultProject, Relation>) rp -> rp
|
|
.getProjectsList()
|
|
.stream()
|
|
.map(p -> Relation.newInstance(rp.getResultId(), p.getId()))
|
|
.collect(Collectors.toList())
|
|
.iterator(),
|
|
Encoders.bean(Relation.class))
|
|
.write()
|
|
.mode(SaveMode.Overwrite)
|
|
.option("compression", "gzip")
|
|
.json(workingPath + resultType + "resultProject");
|
|
}
|
|
|
|
private static eu.dnetlib.dhp.eosc.model.Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p)
|
|
throws DocumentException {
|
|
if (Boolean.TRUE.equals(p.getDataInfo().getDeletedbyinference()))
|
|
return null;
|
|
|
|
eu.dnetlib.dhp.eosc.model.Project project = new eu.dnetlib.dhp.eosc.model.Project();
|
|
|
|
Optional
|
|
.ofNullable(p.getId())
|
|
.ifPresent(id -> project.setId(id));
|
|
|
|
Optional
|
|
.ofNullable(p.getWebsiteurl())
|
|
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getCode())
|
|
.ifPresent(code -> project.setCode(code.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getAcronym())
|
|
.ifPresent(acronynim -> project.setAcronym(acronynim.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getTitle())
|
|
.ifPresent(title -> project.setTitle(title.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getStartdate())
|
|
.ifPresent(sdate -> project.setStartdate(sdate.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getEnddate())
|
|
.ifPresent(edate -> project.setEnddate(edate.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getCallidentifier())
|
|
.ifPresent(cide -> project.setCallidentifier(cide.getValue()));
|
|
|
|
Optional
|
|
.ofNullable(p.getKeywords())
|
|
.ifPresent(key -> project.setKeywords(key.getValue()));
|
|
|
|
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
|
|
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
|
|
boolean mandate = false;
|
|
if (omandate.isPresent()) {
|
|
if (omandate.get().getValue().equals("true")) {
|
|
mandate = true;
|
|
}
|
|
}
|
|
if (oecsc39.isPresent()) {
|
|
if (oecsc39.get().getValue().equals("true")) {
|
|
mandate = true;
|
|
}
|
|
}
|
|
|
|
project.setOpenaccessmandateforpublications(mandate);
|
|
project.setOpenaccessmandatefordataset(false);
|
|
|
|
Optional
|
|
.ofNullable(p.getEcarticle29_3())
|
|
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("true")));
|
|
|
|
project
|
|
.setSubject(
|
|
Optional
|
|
.ofNullable(p.getSubjects())
|
|
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
|
|
.orElse(new ArrayList<>()));
|
|
|
|
Optional
|
|
.ofNullable(p.getSummary())
|
|
.ifPresent(summary -> project.setSummary(summary.getValue()));
|
|
|
|
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
|
|
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
|
|
Optional<Float> ototalcost = Optional.ofNullable(p.getTotalcost());
|
|
|
|
if (ocurrency.isPresent()) {
|
|
if (ofundedamount.isPresent()) {
|
|
if (ototalcost.isPresent()) {
|
|
project
|
|
.setGranted(
|
|
Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
|
|
} else {
|
|
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
|
|
}
|
|
}
|
|
}
|
|
|
|
project
|
|
.setH2020programme(
|
|
Optional
|
|
.ofNullable(p.getH2020classification())
|
|
.map(
|
|
classification -> classification
|
|
.stream()
|
|
.map(
|
|
c -> Programme
|
|
.newInstance(
|
|
c.getH2020Programme().getCode(), c.getH2020Programme().getDescription()))
|
|
.collect(Collectors.toList()))
|
|
.orElse(new ArrayList<>()));
|
|
|
|
Optional<List<Field<String>>> ofundTree = Optional
|
|
.ofNullable(p.getFundingtree());
|
|
List<Funder> funList = new ArrayList<>();
|
|
if (ofundTree.isPresent()) {
|
|
for (Field<String> fundingtree : ofundTree.get()) {
|
|
funList.add(getFunder(fundingtree.getValue()));
|
|
}
|
|
}
|
|
project.setFunding(funList);
|
|
|
|
return project;
|
|
}
|
|
|
|
public static Funder getFunder(String fundingtree) throws DocumentException {
|
|
Funder f = new Funder();
|
|
final Document doc;
|
|
|
|
doc = new SAXReader().read(new StringReader(fundingtree));
|
|
f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
|
|
f.setName(((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText());
|
|
f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
|
|
|
|
String id = "";
|
|
|
|
StringBuilder bld = new StringBuilder();
|
|
|
|
int level = 0;
|
|
List<org.dom4j.Node> nodes = doc.selectNodes("//funding_level_" + level);
|
|
while (!nodes.isEmpty()) {
|
|
for (org.dom4j.Node n : nodes) {
|
|
|
|
List node = n.selectNodes("./id");
|
|
id = ((org.dom4j.Node) node.get(0)).getText();
|
|
id = id.substring(id.indexOf("::") + 2);
|
|
|
|
node = n.selectNodes("./description");
|
|
bld.append(((Node) node.get(0)).getText() + " - ");
|
|
|
|
}
|
|
level += 1;
|
|
nodes = doc.selectNodes("//funding_level_" + level);
|
|
}
|
|
String description = bld.toString();
|
|
if (!id.equals("")) {
|
|
Fundings fundings = new Fundings();
|
|
fundings.setId(id);
|
|
fundings.setDescription(description.substring(0, description.length() - 3).trim());
|
|
f.setFunding_stream(fundings);
|
|
}
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
}
|