# affRo/affro_test_example.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import sys

from affro_cluster import *

# Start (or reuse) the Spark session that drives this example.
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()

try:
    # Wrap `affro` as a Spark UDF whose result is stored as a string column.
    # NOTE(review): `affro` is presumably provided by the wildcard import of
    # `affro_cluster` -- confirm against that module.
    affro_udf = udf(affro, StringType())

    # Sample raw affiliation strings to run through the UDF.
    input_data = ["university of athens", "university of vienna", "UCLA"]

    # Build a single-column string DataFrame and give the column its name.
    df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")

    # Add a column holding the UDF output for each raw affiliation string.
    df_with_custom_value = df.withColumn(
        "affro_value", affro_udf(df["raw_affiliation_string"])
    )

    # Print the full (untruncated) result table.
    df_with_custom_value.show(truncate=False)
finally:
    # Always stop the session, even when one of the steps above raises,
    # so a failure does not leave the Spark session running.
    spark.stop()