Fix selection of columns in graph creation

This commit is contained in:
Ilias Kanellos 2023-05-16 17:32:53 +03:00
parent 12a57e1f58
commit 3c38f7ba6f
1 changed file with 2 additions and 2 deletions

View File

@@ -132,8 +132,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\
& (F.col('dataInfo.invisible') == "false"))\ & (F.col('dataInfo.invisible') == "false"))\
.drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\
.repartition(num_partitions, 'citing').drop('relClass')\ .repartition(num_partitions, 'citing').drop('relClass')\
.withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\
.drop('collectedfrom.value')\ .drop('value')\
.where( .where(
(F.array_contains(F.col('collected_lower'), "opencitations")) (F.array_contains(F.col('collected_lower'), "opencitations"))
| (F.array_contains(F.col('collected_lower'), "crossref")) | (F.array_contains(F.col('collected_lower'), "crossref"))