# affRo/affro_test_example.py
#
# 29 lines
# 768 B
# Python

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import sys
from affro_cluster import *
# Initialize SparkSession
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()
# Register the function as a UDF
affro_udf = udf(affro, StringType())
# Input list of strings
input_data = ["university of athens", "university of vienna", "UCLA"]
# # Convert the list to a Spark DataFrame
df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")
# # Apply your custom UDF to the DataFrame
df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))
df_with_custom_value.show(truncate=False)
# Stop the SparkSession
spark.stop()