From 5295d106914ca9bbe4e0b34e9689f5dcd45b0159 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 13 Jul 2021 16:11:46 +0200 Subject: [PATCH] added check not to dump deletedByInference entities --- .../dnetlib/dhp/oa/graph/dump/complete/DumpGraphEntities.java | 3 +++ .../oa/graph/dump/complete/SparkSelectValidRelationsJob.java | 4 +++- .../dump/complete/DumpOrganizationProjectDatasourceTest.java | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpGraphEntities.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpGraphEntities.java index 36ced3a4ab..530f7a003f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpGraphEntities.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpGraphEntities.java @@ -453,6 +453,7 @@ public class DumpGraphEntities implements Serializable { .map( (MapFunction) o -> mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) o), Encoders.bean(Organization.class)) + .filter(Objects::nonNull) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") @@ -460,6 +461,8 @@ public class DumpGraphEntities implements Serializable { } private static Organization mapOrganization(eu.dnetlib.dhp.schema.oaf.Organization org) { + if(org.getDataInfo().getDeletedbyinference()) + return null; Organization organization = new Organization(); Optional diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java index 8b477b34d6..77402ea1b9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java @@ -110,9 +110,11 @@ public class SparkSelectValidRelationsJob implements Serializable { "UNION ALL " + "SELECT id " + "FROM project " + + "WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " + "UNION ALL " + "SELECT id " + - "FROM datasource ") + "FROM datasource " + + "WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " ) .createOrReplaceTempView("identifiers"); spark diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpOrganizationProjectDatasourceTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpOrganizationProjectDatasourceTest.java index 69100a114e..89ecdfb2b4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpOrganizationProjectDatasourceTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpOrganizationProjectDatasourceTest.java @@ -88,7 +88,7 @@ public class DumpOrganizationProjectDatasourceTest { org.apache.spark.sql.Dataset verificationDataset = spark .createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Organization.class)); - Assertions.assertEquals(34, verificationDataset.count()); + Assertions.assertEquals(15, verificationDataset.count()); verificationDataset .foreach(