Fix selection of columns in graph creation
This commit is contained in:
parent
12a57e1f58
commit
3c38f7ba6f
|
@ -132,8 +132,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\
|
||||||
& (F.col('dataInfo.invisible') == "false"))\
|
& (F.col('dataInfo.invisible') == "false"))\
|
||||||
.drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\
|
.drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\
|
||||||
.repartition(num_partitions, 'citing').drop('relClass')\
|
.repartition(num_partitions, 'citing').drop('relClass')\
|
||||||
.withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\
|
.withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\
|
||||||
.drop('collectedfrom.value')\
|
.drop('value')\
|
||||||
.where(
|
.where(
|
||||||
(F.array_contains(F.col('collected_lower'), "opencitations"))
|
(F.array_contains(F.col('collected_lower'), "opencitations"))
|
||||||
| (F.array_contains(F.col('collected_lower'), "crossref"))
|
| (F.array_contains(F.col('collected_lower'), "crossref"))
|
||||||
|
|
Loading…
Reference in New Issue