integrated filter applied when merging BETA & PROD graphs to rule out records from Datacite

This commit is contained in:
Claudio Atzori 2021-03-19 11:34:44 +01:00
parent 3256b9c836
commit a4e82a65aa
1 changed files with 60 additions and 52 deletions

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import javax.xml.crypto.Data; import javax.xml.crypto.Data;
@ -127,6 +128,13 @@ public class MergeGraphTableSparkJob {
} }
}, Encoders.bean(p_clazz)) }, Encoders.bean(p_clazz))
.filter((FilterFunction<P>) Objects::nonNull) .filter((FilterFunction<P>) Objects::nonNull)
.filter((FilterFunction<P>) o -> {
HashSet<String> collectedFromNames = Optional
.ofNullable(o.getCollectedfrom())
.map(c -> c.stream().map(KeyValue::getValue).collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<String>());
return !collectedFromNames.contains("Datacite");
})
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")