diff --git a/strings.py b/strings.py index 250d659..dea9c43 100644 --- a/strings.py +++ b/strings.py @@ -69,7 +69,7 @@ exploded = spark.read.json(folder_path) \ col("rors").alias("OAlex"), explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better ) -affs = explode \ +affs = exploded \ .select("affiliation") \ .distinct() \ .withColumn("Matchings", oalex_affro_udf(col("affiliation")))