"""Example script: apply ``affro`` to a column of a Spark DataFrame via a UDF.

The script builds a small in-memory DataFrame of raw affiliation strings,
adds an ``affro_value`` column computed by the ``affro`` UDF, prints the
result, and shuts the Spark session down.
"""

import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# NOTE(review): `affro` is not defined in this script, so it is presumably
# supplied by this wildcard import -- confirm, then consider importing it
# explicitly (`from affro_cluster import affro`).
from affro_cluster import *

# Initialize SparkSession
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()

try:
    # Register the function as a UDF whose result is declared as a string.
    affro_udf = udf(affro, StringType())

    # Input list of strings
    input_data = ["university of athens", "university of vienna", "UCLA"]

    # Convert the list to a single-column Spark DataFrame and give the
    # column a descriptive name.
    df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")

    # Apply the custom UDF to the DataFrame
    df_with_custom_value = df.withColumn(
        "affro_value", affro_udf(df["raw_affiliation_string"])
    )

    # truncate=False prints full cell contents instead of clipping them.
    df_with_custom_value.show(truncate=False)
finally:
    # Stop the SparkSession even if the UDF or the show() action fails, so
    # the session is not left running after an error.
    spark.stop()