refactoring

Miriam Baglioni 2020-07-29 16:52:44 +02:00
parent a8d65b68cb
commit b71d12cf26
4 changed files with 72 additions and 75 deletions

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump;
+package eu.dnetlib.dhp.oa.graph.dump.community;

 import java.io.Serializable;
 import java.util.List;

View File

@@ -4,16 +4,16 @@ package eu.dnetlib.dhp.oa.graph.dump.community;
 import java.io.Serializable;
 import java.util.*;

-import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
-import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
-import eu.dnetlib.dhp.oa.graph.dump.Utils;
 import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
+import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
 import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

 public class SparkDumpCommunityProducts implements Serializable {
@@ -48,8 +48,6 @@ public class SparkDumpCommunityProducts implements Serializable {
         final String isLookUpUrl = parser.get("isLookUpUrl");
         log.info("isLookUpUrl: {}", isLookUpUrl);

-        final Optional<String> cm = Optional.ofNullable(parser.get("communityMap"));

         Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);

         queryInformationSystem = new QueryInformationSystem();
@@ -58,11 +56,8 @@ public class SparkDumpCommunityProducts implements Serializable {
         DumpProducts dump = new DumpProducts();
-        dump.run(isSparkSessionManaged, inputPath, outputPath, communityMap, inputClazz, false);
+        dump.run(isSparkSessionManaged, inputPath, outputPath, communityMap, inputClazz, CommunityResult.class, false);
     }
 }
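Note on the change above: the only functional edit in this file is the extra CommunityResult.class argument to dump.run, so the generic dump step now receives its output class explicitly instead of assuming it. As a rough, hypothetical sketch of what a runner parameterized on both the input and the output class could look like (this is not the actual DumpProducts code, whose mapping logic lives elsewhere in the module):

import java.io.Serializable;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.oaf.Result;

// Hypothetical illustration only: a dump step driven by an input class and an output class,
// in the spirit of the dump.run(..., inputClazz, CommunityResult.class, ...) call above.
public class GenericDumpSketch implements Serializable {

    public <I extends Result, O extends Serializable> void run(
        SparkSession spark, String inputPath, String outputPath,
        Class<I> inputClazz, Class<O> outputClazz, MapFunction<I, O> mapper) {

        spark
            .read()
            .textFile(inputPath)                                  // one JSON record per line
            .map(
                (MapFunction<String, I>) value -> new ObjectMapper().readValue(value, inputClazz),
                Encoders.bean(inputClazz))
            .map(mapper, Encoders.bean(outputClazz))              // e.g. Result -> CommunityResult
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }
}

A caller would pass the concrete output class together with a mapper from the internal Result model, mirroring the new dump.run signature shown in the hunk above.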

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump;
+package eu.dnetlib.dhp.oa.graph.dump.community;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -8,9 +8,6 @@ import java.io.StringReader;
 import java.util.*;
 import java.util.stream.Collectors;

-import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
-import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -27,7 +24,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.dump.oaf.Funder;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.dump.oaf.community.Funder;
+import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
+import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Relation;

 import scala.Tuple2;
@@ -71,37 +72,43 @@ public class SparkPrepareResultProject implements Serializable {
         Dataset<Relation> relation = Utils
             .readPath(spark, inputPath + "/relation", Relation.class)
             .filter("dataInfo.deletedbyinference = false and relClass = 'produces'");

-        Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils.readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);
+        Dataset<eu.dnetlib.dhp.schema.oaf.Project> projects = Utils
+            .readPath(spark, inputPath + "/project", eu.dnetlib.dhp.schema.oaf.Project.class);

         projects
             .joinWith(relation, projects.col("id").equalTo(relation.col("source")))
             .groupByKey(
-                (MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value._2().getTarget(), Encoders.STRING())
-            .mapGroups((MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s, it) -> {
-                Set<String> projectSet = new HashSet<>();
-                Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
-                ResultProject rp = new ResultProject();
-                rp.setResultId(first._2().getTarget());
-                eu.dnetlib.dhp.schema.oaf.Project p = first._1();
-                projectSet.add(p.getId());
-                Project ps = getProject(p);
+                (MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, String>) value -> value
+                    ._2()
+                    .getTarget(),
+                Encoders.STRING())
+            .mapGroups(
+                (MapGroupsFunction<String, Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation>, ResultProject>) (s,
+                    it) -> {
+                    Set<String> projectSet = new HashSet<>();
+                    Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
+                    ResultProject rp = new ResultProject();
+                    rp.setResultId(first._2().getTarget());
+                    eu.dnetlib.dhp.schema.oaf.Project p = first._1();
+                    projectSet.add(p.getId());
+                    Project ps = getProject(p);
                 List<Project> projList = new ArrayList<>();
                 projList.add(ps);
                 rp.setProjectsList(projList);
                 it.forEachRemaining(c -> {
                     eu.dnetlib.dhp.schema.oaf.Project op = c._1();
                     if (!projectSet.contains(op.getId())) {
                         projList
                             .add(getProject(op));
                         projectSet.add(op.getId());
                     }
                 });
                 return rp;
             }, Encoders.bean(ResultProject.class))
             .write()
             .mode(SaveMode.Overwrite)
             .option("compression", "gzip")
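The rewrapped block above is behavior-preserving: project-result relations are joined with projects, grouped by the relation target (the result id), and folded into a single ResultProject whose project list is kept free of duplicates via a HashSet. A stripped-down sketch of that groupByKey/mapGroups pattern, with a hypothetical bean in place of the dhp schema classes:

import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

// Hypothetical stand-in for the (project, relation) join output: one row per produced result.
public class ProjectResultPair implements Serializable {
    private String projectId;
    private String resultId;

    public String getProjectId() { return projectId; }
    public void setProjectId(String projectId) { this.projectId = projectId; }
    public String getResultId() { return resultId; }
    public void setResultId(String resultId) { this.resultId = resultId; }

    // Group all pairs sharing a result id and emit one line per result with its distinct projects.
    public static Dataset<String> distinctProjectsPerResult(Dataset<ProjectResultPair> pairs) {
        return pairs
            .groupByKey((MapFunction<ProjectResultPair, String>) p -> p.getResultId(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, ProjectResultPair, String>) (resultId, it) -> {
                Set<String> projects = new HashSet<>();           // plays the role of projectSet above
                it.forEachRemaining(p -> projects.add(p.getProjectId()));
                return resultId + " -> " + String.join(",", projects);
            }, Encoders.STRING());
    }
}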
@@ -109,31 +116,31 @@ public class SparkPrepareResultProject implements Serializable {
     }

     private static Project getProject(eu.dnetlib.dhp.schema.oaf.Project op) {
         Project p = Project
             .newInstance(
                 op.getId(),
                 op.getCode().getValue(),
                 Optional
                     .ofNullable(op.getAcronym())
                     .map(a -> a.getValue())
                     .orElse(null),
                 Optional
                     .ofNullable(op.getTitle())
                     .map(v -> v.getValue())
                     .orElse(null),
                 Optional
                     .ofNullable(op.getFundingtree())
                     .map(
                         value -> value
                             .stream()
                             .map(ft -> getFunder(ft.getValue()))
                             .collect(Collectors.toList())
                             .get(0))
                     .orElse(null));

         Optional<DataInfo> di = Optional.ofNullable(op.getDataInfo());
         Provenance provenance = new Provenance();
-        if(di.isPresent()){
+        if (di.isPresent()) {
             provenance.setProvenance(di.get().getProvenanceaction().getClassname());
             provenance.setTrust(di.get().getTrust());
             p.setProvenance(provenance);
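getProject (above) maps the internal oaf Project onto the dump Project, using Optional.ofNullable chains so that a missing acronym, title, or funding tree simply becomes null, and attaching Provenance only when DataInfo is present. The same null-safe idiom in isolation, with made-up types and field names:

import java.util.Optional;

// Hypothetical types, only to show the Optional.ofNullable(...).map(...).orElse(null) idiom.
public class OptionalMappingSketch {

    static class Field {
        private final String value;
        Field(String value) { this.value = value; }
        String getValue() { return value; }
    }

    static String valueOrNull(Field field) {
        return Optional
            .ofNullable(field)          // tolerate a missing field
            .map(Field::getValue)       // unwrap only when present
            .orElse(null);              // otherwise fall back to null, as getProject does
    }

    public static void main(String[] args) {
        System.out.println(valueOrNull(new Field("acronym"))); // acronym
        System.out.println(valueOrNull(null));                 // null
    }
}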

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump;
+package eu.dnetlib.dhp.oa.graph.dump.community;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -8,8 +8,6 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
@@ -20,10 +18,9 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.dump.oaf.Result;
-import eu.dnetlib.dhp.schema.oaf.Project;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import scala.Tuple2;
+import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;

 public class SparkUpdateProjectInfo implements Serializable {
@@ -70,22 +67,20 @@ public class SparkUpdateProjectInfo implements Serializable {
         SparkSession spark,
         String inputPath,
         String outputPath,
-        String preparedInfoPath) {// ,
-        // Class<R> inputClazz) {
-        Dataset<Result> result = Utils.readPath(spark, inputPath, Result.class);
+        String preparedInfoPath) {
+        Dataset<CommunityResult> result = Utils.readPath(spark, inputPath, CommunityResult.class);
         Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);

         result
             .joinWith(
                 resultProject, result.col("id").equalTo(resultProject.col("resultId")),
                 "left")
             .map(value -> {
-                Result r = value._1();
+                CommunityResult r = value._1();
                 Optional.ofNullable(value._2()).ifPresent(rp -> {
                     r.setProjects(rp.getProjectsList());
                 });
                 return r;
-            }, Encoders.bean(Result.class))
+            }, Encoders.bean(CommunityResult.class))
             .write()
             .option("compression", "gzip")
             .mode(SaveMode.Append)
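With the change above, SparkUpdateProjectInfo reads CommunityResult records, left-joins them with the prepared ResultProject info on the result id, and copies the project list onto matching results; records without projects pass through untouched. A minimal, self-contained sketch of that enrich-via-left-join pattern (hypothetical beans, not the dhp schema classes):

import java.io.Serializable;
import java.util.List;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

import scala.Tuple2;

public class LeftJoinEnrichSketch implements Serializable {

    // Hypothetical bean standing in for CommunityResult.
    public static class Record implements Serializable {
        private String id;
        private List<String> projects;

        public String getId() { return id; }
        public void setId(String id) { this.id = id; }
        public List<String> getProjects() { return projects; }
        public void setProjects(List<String> projects) { this.projects = projects; }
    }

    // Hypothetical bean standing in for ResultProject.
    public static class Extra implements Serializable {
        private String resultId;
        private List<String> projects;

        public String getResultId() { return resultId; }
        public void setResultId(String resultId) { this.resultId = resultId; }
        public List<String> getProjects() { return projects; }
        public void setProjects(List<String> projects) { this.projects = projects; }
    }

    public static Dataset<Record> enrich(Dataset<Record> records, Dataset<Extra> extras) {
        return records
            .joinWith(extras, records.col("id").equalTo(extras.col("resultId")), "left")
            .map((MapFunction<Tuple2<Record, Extra>, Record>) value -> {
                Record r = value._1();
                // With a "left" join, value._2() is null when no match exists: leave the record untouched.
                Optional.ofNullable(value._2()).ifPresent(e -> r.setProjects(e.getProjects()));
                return r;
            }, Encoders.bean(Record.class));
    }
}

The Optional.ofNullable guard is what makes the left join safe here, exactly as in the diff above.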