forked from D-Net/dnet-hadoop
WIP: promote job functions implementation snapshot
This commit is contained in:
parent
cc63cdc9e6
commit
2e996d610f
|
@ -1,45 +1,134 @@
|
|||
package eu.dnetlib.dhp.actionmanager;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.ReduceFunction;
|
||||
import org.apache.spark.sql.Column;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class PromoteActionSetFromHDFSFunctions {
|
||||
|
||||
public static <T extends OafEntity> Dataset<T> groupEntitiesByIdAndMerge(Dataset<T> entityDS,
|
||||
Class<T> clazz) {
|
||||
return entityDS
|
||||
.groupByKey((MapFunction<T, String>) OafEntity::getId, Encoders.STRING())
|
||||
.reduceGroups((ReduceFunction<T>) (x1, x2) -> {
|
||||
x1.mergeFrom(x2);
|
||||
return x1;
|
||||
})
|
||||
.map((MapFunction<Tuple2<String, T>, T>) pair -> pair._2, Encoders.bean(clazz));
|
||||
public static <T extends Oaf> Dataset<T> joinOafEntityWithActionPayloadAndMerge(Dataset<T> oafDS,
|
||||
Dataset<String> actionPayloadDS,
|
||||
SerializableSupplier<Function<T, String>> oafIdFn,
|
||||
SerializableSupplier<BiFunction<String, Class<T>, T>> actionPayloadToOafFn,
|
||||
SerializableSupplier<BiFunction<T, T, T>> mergeAndGetFn,
|
||||
Class<T> clazz) {
|
||||
Dataset<Tuple2<String, T>> oafWithIdDS = oafDS
|
||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(oafIdFn.get().apply(value), value),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||
|
||||
Dataset<Tuple2<String, T>> actionPayloadWithIdDS = actionPayloadDS
|
||||
.map((MapFunction<String, T>) value -> actionPayloadToOafFn.get().apply(value, clazz), Encoders.kryo(clazz))
|
||||
.filter((FilterFunction<T>) Objects::nonNull)
|
||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(oafIdFn.get().apply(value), value),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||
|
||||
return oafWithIdDS
|
||||
.joinWith(actionPayloadWithIdDS, oafWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), "left_outer")
|
||||
.map((MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> {
|
||||
T left = value._1()._2();
|
||||
return Optional
|
||||
.ofNullable(value._2())
|
||||
.map(Tuple2::_2)
|
||||
.map(x -> mergeAndGetFn.get().apply(left, x))
|
||||
.orElse(left);
|
||||
}, Encoders.kryo(clazz));
|
||||
}
|
||||
|
||||
public static <T extends OafEntity, S> Dataset<T> joinEntitiesWithActionPayloadAndMerge(Dataset<T> entityDS,
|
||||
Dataset<S> actionPayloadDS,
|
||||
BiFunction<Dataset<T>, Dataset<S>, Column> entityToActionPayloadJoinExpr,
|
||||
BiFunction<S, Class<T>, T> actionPayloadToEntityFn,
|
||||
Class<T> clazz) {
|
||||
return entityDS
|
||||
.joinWith(actionPayloadDS, entityToActionPayloadJoinExpr.apply(entityDS, actionPayloadDS), "left_outer")
|
||||
.map((MapFunction<Tuple2<T, S>, T>) pair -> Optional
|
||||
.ofNullable(pair._2())
|
||||
.map(x -> {
|
||||
T entity = actionPayloadToEntityFn.apply(x, clazz);
|
||||
pair._1().mergeFrom(entity);
|
||||
return pair._1();
|
||||
})
|
||||
.orElse(pair._1()), Encoders.bean(clazz));
|
||||
public static <T extends Oaf> Dataset<T> groupOafByIdAndMerge(Dataset<T> oafDS,
|
||||
SerializableSupplier<Function<T, String>> oafIdFn,
|
||||
SerializableSupplier<BiFunction<T, T, T>> mergeAndGetFn,
|
||||
Class<T> clazz) {
|
||||
return oafDS
|
||||
.groupByKey((MapFunction<T, String>) x -> oafIdFn.get().apply(x), Encoders.STRING())
|
||||
.reduceGroups((ReduceFunction<T>) (v1, v2) -> mergeAndGetFn.get().apply(v1, v2))
|
||||
.map((MapFunction<Tuple2<String, T>, T>) Tuple2::_2, Encoders.kryo(clazz));
|
||||
}
|
||||
|
||||
public static <T extends Oaf> Dataset<T> groupOafByIdAndMergeUsingAggregator(Dataset<T> oafDS,
|
||||
SerializableSupplier<T> zeroFn,
|
||||
SerializableSupplier<Function<T, String>> idFn,
|
||||
Class<T> clazz) {
|
||||
TypedColumn<T, T> aggregator = new OafAggregator<>(zeroFn, clazz).toColumn();
|
||||
return oafDS
|
||||
.groupByKey((MapFunction<T, String>) x -> idFn.get().apply(x), Encoders.STRING())
|
||||
.agg(aggregator)
|
||||
.map((MapFunction<Tuple2<String, T>, T>) Tuple2::_2, Encoders.kryo(clazz));
|
||||
}
|
||||
|
||||
}
|
||||
public static class OafAggregator<T extends Oaf> extends Aggregator<T, T, T> {
|
||||
private SerializableSupplier<T> zero;
|
||||
private Class<T> clazz;
|
||||
|
||||
public OafAggregator(SerializableSupplier<T> zero, Class<T> clazz) {
|
||||
this.zero = zero;
|
||||
this.clazz = clazz;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T zero() {
|
||||
return zero.get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T reduce(T b, T a) {
|
||||
return mergeFrom(b, a);
|
||||
}
|
||||
|
||||
@Override
|
||||
public T merge(T b1, T b2) {
|
||||
return mergeFrom(b1, b2);
|
||||
}
|
||||
|
||||
private T mergeFrom(T left, T right) {
|
||||
if (isNonNull(left)) {
|
||||
if (left instanceof Relation) {
|
||||
((Relation) left).mergeFrom((Relation) right);
|
||||
return left;
|
||||
}
|
||||
((OafEntity) left).mergeFrom((OafEntity) right);
|
||||
return left;
|
||||
}
|
||||
|
||||
if (right instanceof Relation) {
|
||||
((Relation) right).mergeFrom((Relation) left);
|
||||
return right;
|
||||
}
|
||||
((OafEntity) right).mergeFrom((OafEntity) left);
|
||||
return right;
|
||||
}
|
||||
|
||||
private Boolean isNonNull(T a) {
|
||||
return Objects.nonNull(a.getLastupdatetimestamp());
|
||||
}
|
||||
|
||||
@Override
|
||||
public T finish(T reduction) {
|
||||
return reduction;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<T> bufferEncoder() {
|
||||
return Encoders.kryo(clazz);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<T> outputEncoder() {
|
||||
return Encoders.kryo(clazz);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue