
code formatting

Claudio Atzori 2020-07-16 19:06:56 +02:00
parent db8b90a156
commit 1781609508
1 changed file with 94 additions and 90 deletions

eu/dnetlib/dhp/oa/graph/merge/MergeGraphSparkJob.java

@@ -1,11 +1,11 @@
 package eu.dnetlib.dhp.oa.graph.merge;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.Objects;
+import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
@@ -16,13 +16,16 @@ import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 import scala.Tuple2;
-import java.util.Objects;
-import java.util.Optional;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
 /**
  * Combines the content from two aggregator graph tables of the same type, entities (or relationships) with the same ids
  * are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined
@@ -30,101 +33,102 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 */
public class MergeGraphSparkJob {

    private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                CleanGraphSparkJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json"));
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String betaInputPath = parser.get("betaInputPath");
        log.info("betaInputPath: {}", betaInputPath);

        String prodInputPath = parser.get("prodInputPath");
        log.info("prodInputPath: {}", prodInputPath);

        String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        String graphTableClassName = parser.get("graphTableClassName");
        log.info("graphTableClassName: {}", graphTableClassName);

        Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);

        SparkConf conf = new SparkConf();
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.registerKryoClasses(ModelSupport.getOafModelClasses());

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                removeOutputDir(spark, outputPath);
                mergeGraphTable(spark, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath);
            });
    }

    private static <P extends Oaf, B extends Oaf> void mergeGraphTable(
        SparkSession spark,
        String betaInputPath,
        String prodInputPath,
        Class<P> p_clazz,
        Class<B> b_clazz,
        String outputPath) {

        Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
        Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);

        prod
            .joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
            .map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
                Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
                Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
                if (p.isPresent() & !b.isPresent()) {
                    return p.get();
                }
                if (b.isPresent()) {
                    return (P) b.get();
                }
                return null;
            }, Encoders.bean(p_clazz))
            .filter((FilterFunction<P>) Objects::nonNull)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
        SparkSession spark, String inputEntityPath, Class<T> clazz) {

        log.info("Reading Graph table from: {}", inputEntityPath);
        return spark
            .read()
            .textFile(inputEntityPath)
            .map(
                (MapFunction<String, Tuple2<String, T>>) value -> {
                    final T t = OBJECT_MAPPER.readValue(value, clazz);
                    final String id = ModelSupport.idFn().apply(t);
                    return new Tuple2<>(id, t);
                },
                Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
    }

    private static void removeOutputDir(SparkSession spark, String path) {
        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
    }
}
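
For readers skimming the diff, the precedence applied inside the map function above is: a record present only in PROD is kept, any record present in BETA wins over its PROD counterpart, and keys missing on both sides fall through to null and are dropped by the Objects::nonNull filter before writing. The sketch below is a minimal, self-contained illustration of that rule; the mergeRecords helper and the String payloads are hypothetical stand-ins for the Oaf entities handled by the job.

import java.util.Optional;

public class MergePreferenceExample {

    // Hypothetical helper mirroring the precedence used in MergeGraphSparkJob:
    // BETA wins whenever it is present, PROD is kept only when BETA has no record,
    // and null signals that neither side contributed (filtered out downstream).
    static <T> T mergeRecords(Optional<T> prod, Optional<T> beta) {
        if (prod.isPresent() && !beta.isPresent()) {
            return prod.get();
        }
        return beta.orElse(null);
    }

    public static void main(String[] args) {
        // Same id on both sides: the BETA version is preferred.
        System.out.println(mergeRecords(Optional.of("prod record"), Optional.of("beta record"))); // beta record
        // Only PROD has the id: the PROD version is kept.
        System.out.println(mergeRecords(Optional.of("prod only"), Optional.empty()));             // prod only
        // Only BETA has the id: the BETA version is kept.
        System.out.println(mergeRecords(Optional.empty(), Optional.of("beta only")));             // beta only
    }
}

Because the tables are combined with a full_outer join, each of the three cases above corresponds to one possible shape of the joined tuple.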