forked from D-Net/dnet-hadoop
added teh logic to dump also the products for the whole graph. They will miss collected from and context information that will be materialized as new relations
This commit is contained in:
parent
00f2b8410a
commit
968c59d97a
|
@ -1,17 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
@ -20,11 +10,24 @@ import java.util.Optional;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class DumpProducts implements Serializable {
|
||||
|
||||
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, CommunityMap communityMap, Class<? extends OafEntity> inputClazz, boolean graph) {
|
||||
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, CommunityMap communityMap,
|
||||
Class<? extends OafEntity> inputClazz,
|
||||
Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> outputClazz,
|
||||
boolean graph) {
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
|
@ -33,22 +36,23 @@ public class DumpProducts implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
execDump(spark, inputPath, outputPath, communityMap, inputClazz, graph);// , dumpClazz);
|
||||
execDump(spark, inputPath, outputPath, communityMap, inputClazz, outputClazz, graph);// , dumpClazz);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
public static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> void execDump(SparkSession spark,
|
||||
public static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> void execDump(
|
||||
SparkSession spark,
|
||||
String inputPath,
|
||||
String outputPath,
|
||||
CommunityMap communityMap,
|
||||
Class<I> inputClazz,
|
||||
boolean graph) {
|
||||
Class<O> outputClazz,
|
||||
boolean graph) throws ClassNotFoundException {
|
||||
|
||||
Dataset<I> tmp = Utils.readPath(spark, inputPath, inputClazz);
|
||||
|
||||
tmp
|
||||
.map(value -> execMap(value, communityMap, graph), Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.Result.class))
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map(value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
@ -57,14 +61,24 @@ public class DumpProducts implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
private static <I extends OafEntity> eu.dnetlib.dhp.schema.dump.oaf.Result execMap(I value,
|
||||
private static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O execMap(I value,
|
||||
CommunityMap communityMap,
|
||||
boolean graph) {
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
if (odInfo.get().getDeletedbyinference()) {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!graph) {
|
||||
Set<String> communities = communityMap.keySet();
|
||||
|
||||
Optional<List<Context>> inputContext = Optional.ofNullable(((eu.dnetlib.dhp.schema.oaf.Result)value).getContext());
|
||||
Optional<List<Context>> inputContext = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Result) value).getContext());
|
||||
if (!inputContext.isPresent()) {
|
||||
return null;
|
||||
}
|
||||
|
@ -81,7 +95,7 @@ public class DumpProducts implements Serializable {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
return ResultMapper.map(value, communityMap);
|
||||
return (O) ResultMapper.map(value, communityMap, graph);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue