pre-group the records in each table before joining the contents from BETA and PROD together

Claudio Atzori 2023-03-02 14:49:19 +01:00
parent db9dad4aa7
commit 8ec0d62d91
2 changed files with 69 additions and 17 deletions
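In short: each graph table is now read and reduced to at most one record per identifier (groupByKey on the OAF id plus a typed Aggregator) before the BETA/PROD full outer join, instead of joining the raw, possibly duplicated rows. Below is a minimal, self-contained sketch of that pattern; it is illustrative only, with Rec and MergeAgg as hypothetical stand-ins for the OAF model classes and for the mergeEntities/mergeFrom logic used in the real job.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;

import scala.Tuple2;

public class PreGroupJoinSketch {

	// Hypothetical stand-in for an OAF entity: an id plus some payload.
	public static class Rec implements java.io.Serializable {
		public String id;
		public String payload;

		public Rec() {
		}

		public Rec(String id, String payload) {
			this.id = id;
			this.payload = payload;
		}

		@Override
		public String toString() {
			return id + ":" + payload;
		}
	}

	// Stand-in merge logic: keep the record with the longer payload.
	public static class MergeAgg extends Aggregator<Rec, Rec, Rec> {
		@Override
		public Rec zero() {
			return null; // like GroupingAggregator in the commit, the empty buffer is null
		}

		@Override
		public Rec reduce(Rec b, Rec a) {
			return pick(b, a);
		}

		@Override
		public Rec merge(Rec b, Rec a) {
			return pick(b, a);
		}

		@Override
		public Rec finish(Rec r) {
			return r;
		}

		@Override
		public Encoder<Rec> bufferEncoder() {
			return Encoders.kryo(Rec.class);
		}

		@Override
		public Encoder<Rec> outputEncoder() {
			return Encoders.kryo(Rec.class);
		}

		private Rec pick(Rec b, Rec a) {
			if (a == null) return b;
			if (b == null) return a;
			return a.payload.length() >= b.payload.length() ? a : b;
		}
	}

	// Group a table by id, reducing duplicates to a single record per key.
	private static Dataset<Tuple2<String, Rec>> groupById(Dataset<Rec> ds) {
		return ds
			.groupByKey((MapFunction<Rec, String>) r -> r.id, Encoders.STRING())
			.agg(new MergeAgg().toColumn());
	}

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();

		Dataset<Rec> beta = spark.createDataset(
			Arrays.asList(new Rec("1", "beta-a"), new Rec("1", "beta-bb")), Encoders.kryo(Rec.class));
		Dataset<Rec> prod = spark.createDataset(
			Arrays.asList(new Rec("1", "prod-a"), new Rec("2", "prod-b")), Encoders.kryo(Rec.class));

		Dataset<Tuple2<String, Rec>> betaGrouped = groupById(beta);
		Dataset<Tuple2<String, Rec>> prodGrouped = groupById(prod);

		// groupByKey exposes the String key as a column named "value",
		// hence the col("value") join condition in the diff below.
		prodGrouped
			.joinWith(betaGrouped, prodGrouped.col("value").equalTo(betaGrouped.col("value")), "full_outer")
			.collectAsList()
			.forEach(t -> System.out.println(t._1() + " | " + t._2()));

		spark.stop();
	}
}

The important property is that after groupById each side of the full outer join contains a given id at most once, so the merge policy (mergeWithPriorityToBETA/mergeWithPriorityToPROD in the real job) sees exactly one candidate from each environment.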

MergeGraphTableSparkJob.java

@@ -11,10 +11,9 @@ import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Aggregator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,6 +24,7 @@ import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;

 /**
@@ -107,11 +107,11 @@ public class MergeGraphTableSparkJob {
 			Class<B> b_clazz,
 			String outputPath) {

-		Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
-		Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);
+		Dataset<Tuple2<String, B>> beta = readTableAndGroupById(spark, betaInputPath, b_clazz);
+		Dataset<Tuple2<String, P>> prod = readTableAndGroupById(spark, prodInputPath, p_clazz);

 		prod
-			.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
+			.joinWith(beta, prod.col("value").equalTo(beta.col("value")), "full_outer")
 			.map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
 				Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
 				Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
@@ -126,12 +126,13 @@ public class MergeGraphTableSparkJob {
 					case "PROD":
 						return mergeWithPriorityToPROD(p, b);
 				}
-			}, Encoders.bean(p_clazz))
+			}, Encoders.kryo(p_clazz))
 			.filter((FilterFunction<P>) Objects::nonNull)
+			.map((MapFunction<P, String>) OBJECT_MAPPER::writeValueAsString, Encoders.STRING())
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
-			.json(outputPath);
+			.text(outputPath);
 	}

 	/**
@@ -212,20 +213,71 @@ public class MergeGraphTableSparkJob {
 		return null;
 	}

-	private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
+	private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableAndGroupById(
 		SparkSession spark, String inputEntityPath, Class<T> clazz) {

+		final TypedColumn<T, T> aggregator = new GroupingAggregator(clazz).toColumn();
 		log.info("Reading Graph table from: {}", inputEntityPath);
 		return spark
 			.read()
 			.textFile(inputEntityPath)
-			.map(
-				(MapFunction<String, Tuple2<String, T>>) value -> {
-					final T t = OBJECT_MAPPER.readValue(value, clazz);
-					final String id = ModelSupport.idFn().apply(t);
-					return new Tuple2<>(id, t);
-				},
-				Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
+			.map((MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.kryo(clazz))
+			.groupByKey((MapFunction<T, String>) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING())
+			.agg(aggregator);
+	}
+
+	public static class GroupingAggregator<T extends Oaf> extends Aggregator<T, T, T> {
+
+		private Class<T> clazz;
+
+		public GroupingAggregator(Class<T> clazz) {
+			this.clazz = clazz;
+		}
+
+		@Override
+		public T zero() {
+			return null;
+		}
+
+		@Override
+		public T reduce(T b, T a) {
+			return mergeAndGet(b, a);
+		}
+
+		private T mergeAndGet(T b, T a) {
+			if (Objects.nonNull(a) && Objects.nonNull(b)) {
+				if (ModelSupport.isSubClass(a, OafEntity.class) && ModelSupport.isSubClass(b, OafEntity.class)) {
+					return (T) OafMapperUtils.mergeEntities((OafEntity) b, (OafEntity) a);
+				}
+				if (a instanceof Relation && b instanceof Relation) {
+					((Relation) a).mergeFrom(b);
+					return a;
+				}
+			}
+			return Objects.isNull(a) ? b : a;
+		}
+
+		@Override
+		public T merge(T b, T a) {
+			return mergeAndGet(b, a);
+		}
+
+		@Override
+		public T finish(T j) {
+			return j;
+		}
+
+		@Override
+		public Encoder<T> bufferEncoder() {
+			return Encoders.kryo(clazz);
+		}
+
+		@Override
+		public Encoder<T> outputEncoder() {
+			return Encoders.kryo(clazz);
+		}
+
+	}

 	private static void removeOutputDir(SparkSession spark, String path) {
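One detail of the aggregator above worth spelling out: zero() is null, so the first reduce call, and any merge with an empty partition, must tolerate a null buffer; that is exactly what mergeAndGet's null checks do (and the kryo buffer encoder is used in part because it can carry such a null). A tiny, self-contained illustration of that contract, with a hypothetical string concatenation standing in for the OAF merge logic:

import java.util.Objects;

public class MergeAndGetContract {

	// Same null-handling shape as GroupingAggregator.mergeAndGet above.
	static String mergeAndGet(String b, String a) {
		if (Objects.nonNull(a) && Objects.nonNull(b)) {
			return b + "+" + a; // stand-in for mergeEntities / mergeFrom
		}
		return Objects.isNull(a) ? b : a;
	}

	public static void main(String[] args) {
		System.out.println(mergeAndGet(null, "first")); // "first": first record meets the null zero() buffer
		System.out.println(mergeAndGet("acc", null)); // "acc": merging an empty partition is a no-op
		System.out.println(mergeAndGet("acc", "next")); // "acc+next": both sides present, actually merge
	}
}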

workflow.xml (Oozie workflow)

@@ -275,7 +275,7 @@
 			--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 			--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 			--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-			--conf spark.sql.shuffle.partitions=7680
+			--conf spark.sql.shuffle.partitions=10000
 		</spark-opts>
 		<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/relation</arg>
 		<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/relation</arg>