integrated filter applied when merging BETA & PROD graphs to rule out records from Datacite

This commit is contained in:
Claudio Atzori 2021-03-19 11:34:44 +01:00
parent 3256b9c836
commit a4e82a65aa
1 changed files with 60 additions and 52 deletions

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*;
import java.util.stream.Collectors;
import javax.xml.crypto.Data;
@ -127,6 +128,13 @@ public class MergeGraphTableSparkJob {
}
}, Encoders.bean(p_clazz))
.filter((FilterFunction<P>) Objects::nonNull)
.filter((FilterFunction<P>) o -> {
HashSet<String> collectedFromNames = Optional
.ofNullable(o.getCollectedfrom())
.map(c -> c.stream().map(KeyValue::getValue).collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<String>());
return !collectedFromNames.contains("Datacite");
})
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")