forked from antonis.lempesis/dnet-hadoop
integrated filter applied when merging BETA & PROD graphs to rule out records from Datacite
This commit is contained in:
parent
3256b9c836
commit
a4e82a65aa
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.merge;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import javax.xml.crypto.Data;
|
import javax.xml.crypto.Data;
|
||||||
|
|
||||||
|
@ -127,6 +128,13 @@ public class MergeGraphTableSparkJob {
|
||||||
}
|
}
|
||||||
}, Encoders.bean(p_clazz))
|
}, Encoders.bean(p_clazz))
|
||||||
.filter((FilterFunction<P>) Objects::nonNull)
|
.filter((FilterFunction<P>) Objects::nonNull)
|
||||||
|
.filter((FilterFunction<P>) o -> {
|
||||||
|
HashSet<String> collectedFromNames = Optional
|
||||||
|
.ofNullable(o.getCollectedfrom())
|
||||||
|
.map(c -> c.stream().map(KeyValue::getValue).collect(Collectors.toCollection(HashSet::new)))
|
||||||
|
.orElse(new HashSet<String>());
|
||||||
|
return !collectedFromNames.contains("Datacite");
|
||||||
|
})
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
|
Loading…
Reference in New Issue