29 lines
768 B
Python
29 lines
768 B
Python
from pyspark.sql import SparkSession
|
|
from pyspark.sql.functions import udf
|
|
from pyspark.sql.types import StringType
|
|
|
|
import sys
|
|
|
|
from affro_cluster import *
|
|
|
|
# Demo script: run `affro` over a few sample affiliation strings through a
# Spark UDF and print the resulting DataFrame.

# Initialize SparkSession (getOrCreate reuses an already-active session).
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()

try:
    # Register the function as a UDF whose result is declared as a string.
    # NOTE(review): `affro` is presumably supplied by the wildcard import
    # from `affro_cluster` -- confirm.
    affro_udf = udf(affro, StringType())

    # Input list of strings
    input_data = ["university of athens", "university of vienna", "UCLA"]

    # Convert the list to a single-column Spark DataFrame and name the column
    df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")

    # Apply the UDF to each row, storing the result in a new column
    df_with_custom_value = df.withColumn(
        "affro_value", affro_udf(df["raw_affiliation_string"])
    )

    # truncate=False prints full cell contents instead of clipping them.
    # Spark evaluates lazily, so any error raised inside the UDF surfaces here.
    df_with_custom_value.show(truncate=False)
finally:
    # Stop the SparkSession even if the job above raised, so the session and
    # its resources are not left running after a failure.
    spark.stop()