Add a "CleanRelation" action after the PropagateRelation to filter out all relations that have been deleted by inference or that are pointing to dangling entities #328

Merged
miriam.baglioni merged 2 commits from cleanup_relations_after_dedup into beta 2023-08-08 09:49:13 +02:00
1 changed file with 7 additions and 3 deletions
Showing only changes of commit 97b6d1dc45 - Show all commits

View File

@@ -46,22 +46,26 @@ class SparkCleanRelation(parser: ArgumentApplicationParser, spark: SparkSession)
val entities = val entities =
Seq("datasource", "project", "organization", "publication", "dataset", "software", "otherresearchproduct") Seq("datasource", "project", "organization", "publication", "dataset", "software", "otherresearchproduct")
val idsSchema = StructType.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>")
val emptyIds = spark.createDataFrame(spark.sparkContext.emptyRDD[Row].setName("empty"), val emptyIds = spark.createDataFrame(spark.sparkContext.emptyRDD[Row].setName("empty"),
new StructType().add(StructField("id", DataTypes.StringType, true))) idsSchema)
val ids = entities val ids = entities
.foldLeft(emptyIds)((ds, entity) => { .foldLeft(emptyIds)((ds, entity) => {
val entityPath = graphBasePath + '/' + entity val entityPath = graphBasePath + '/' + entity
if (HdfsSupport.exists(entityPath, spark.sparkContext.hadoopConfiguration)) { if (HdfsSupport.exists(entityPath, spark.sparkContext.hadoopConfiguration)) {
ds.union(spark.read.schema("`id` STRING").json(entityPath)) ds.union(spark.read.schema(idsSchema).json(entityPath))
} else { } else {
ds ds
} }
}) })
.filter("dataInfo.deletedbyinference != true AND dataInfo.invisible != true")
.select("id")
.distinct() .distinct()
val relations = spark.read.schema(Encoders.bean(classOf[Relation]).schema).json(inputPath) val relations = spark.read.schema(Encoders.bean(classOf[Relation]).schema).json(inputPath)
.filter(col("dataInfo.deletedbyinference").isNull || col("dataInfo.deletedbyinference") === false) .filter("dataInfo.deletedbyinference != true AND dataInfo.invisible != true")
AbstractSparkAction.save( AbstractSparkAction.save(
relations relations