forked from D-Net/dnet-hadoop
[raw_all] The claim merge procedure includes the claimed contexts in the merged result
This commit is contained in:
parent fdcff42e46
commit b7b8e0986e
@@ -3,8 +3,12 @@ package eu.dnetlib.dhp.oa.graph.raw;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@@ -98,14 +102,9 @@ public class MergeClaimsApplication {
         raw
             .joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer")
             .map(
-                (MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> {
-                    Optional<Tuple2<String, T>> opRaw = Optional.ofNullable(value._1());
-                    Optional<Tuple2<String, T>> opClaim = Optional.ofNullable(value._2());
-
-                    return opRaw.isPresent()
-                        ? opRaw.get()._2()
-                        : opClaim.isPresent() ? opClaim.get()._2() : null;
-                },
+                (MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> processClaims(
+                    Optional.ofNullable(value._1()),
+                    Optional.ofNullable(value._2())),
                 Encoders.bean(clazz))
             .filter(Objects::nonNull)
             .map(
@@ -117,6 +116,78 @@ public class MergeClaimsApplication {
             .text(outPath);
     }
 
+    private static <T extends Oaf> T processClaims(Optional<Tuple2<String, T>> opRaw,
+        Optional<Tuple2<String, T>> opClaim) {
+
+        // when both are present
+        if (opClaim.isPresent() && opRaw.isPresent()) {
+            T oafClaim = opClaim.get()._2();
+            if (oafClaim instanceof Result) {
+                T oafRaw = opRaw.get()._2();
+
+                // merge the context lists from both oaf objects ...
+                final List<Context> context = mergeContexts((Result) oafClaim, (Result) oafRaw);
+
+                // ... and set it on the result from the aggregator
+                ((Result) oafRaw).setContext(context);
+                return oafRaw;
+            }
+        }
+
+        // otherwise prefer the result from the aggregator
+        return opRaw.isPresent()
+            ? opRaw.get()._2()
+            : opClaim.map(Tuple2::_2).orElse(null);
+    }
+
+    private static List<Context> mergeContexts(Result oafClaim, Result oafRaw) {
+        return new ArrayList<>(
+            Stream
+                .concat(
+                    Optional
+                        .ofNullable(oafClaim.getContext())
+                        .map(List::stream)
+                        .orElse(Stream.empty()),
+                    Optional
+                        .ofNullable(oafRaw.getContext())
+                        .map(List::stream)
+                        .orElse(Stream.empty()))
+                .collect(
+                    Collectors
+                        .toMap(
+                            Context::getId,
+                            c -> c,
+                            (c1, c2) -> {
+                                Context c = new Context();
+                                c.setId(c1.getId());
+                                c
+                                    .setDataInfo(
+                                        new ArrayList<>(
+                                            Stream
+                                                .concat(
+                                                    Optional
+                                                        .ofNullable(c1.getDataInfo())
+                                                        .map(List::stream)
+                                                        .orElse(Stream.empty()),
+                                                    Optional
+                                                        .ofNullable(c2.getDataInfo())
+                                                        .map(List::stream)
+                                                        .orElse(Stream.empty()))
+                                                .collect(
+                                                    Collectors
+                                                        .toMap(
+                                                            d -> Optional
+                                                                .ofNullable(d.getProvenanceaction())
+                                                                .map(Qualifier::getClassid)
+                                                                .orElse(""),
+                                                            d -> d,
+                                                            (d1, d2) -> d1))
+                                                .values()));
+                                return c;
+                            }))
+                .values());
+    }
+
     private static <T extends Oaf> Dataset<T> readFromPath(
         SparkSession spark, String path, Class<T> clazz) {
         return spark
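Note (illustration only, not part of the commit): both levels of mergeContexts rely on Collectors.toMap(keyMapper, valueMapper, mergeFunction), first keying contexts by id, then keying the colliding contexts' dataInfo entries by provenanceaction classid. A minimal, self-contained sketch of that merge-by-key pattern, with hypothetical context ids as plain strings standing in for the Context and DataInfo beans:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class MergeByKeySketch {

    public static void main(String[] args) {
        // contexts attached to the claimed record vs. contexts on the aggregated record,
        // here reduced to plain context ids (hypothetical values)
        List<String> claimed = List.of("sobigdata::project1", "dh-ch");
        List<String> raw = List.of("dh-ch", "clarin");

        // concatenate both lists and keep one entry per id, preferring the first
        // occurrence on collision -- the same Collectors.toMap(key, value, merge)
        // shape the commit uses for Context and DataInfo objects
        Map<String, String> merged = Stream
            .concat(claimed.stream(), raw.stream())
            .collect(Collectors.toMap(id -> id, id -> id, (a, b) -> a, LinkedHashMap::new));

        System.out.println(merged.values());
        // prints: [sobigdata::project1, dh-ch, clarin]
    }
}

In the commit itself the outer collision handler does not keep just the first value: it builds a new Context with the same id and the union of both dataInfo lists (deduplicated by provenanceaction classid), so provenance from both the claim and the aggregator is preserved.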