Update update_records.py

This commit is contained in:
Serafeim Chatzopoulos 2024-09-09 15:23:39 +02:00
parent eae8412a16
commit 78de49bb3a
1 changed file with 4 additions and 6 deletions

View File

@@ -2,15 +2,13 @@ import json
import os
from pyspark.sql import SparkSession
from affro_cluster import *
import sys
# NOTE(review): this listing looks like a rendered diff whose +/- markers were
# stripped, so superseded and replacement lines may appear side by side --
# confirm against the real update_records.py before relying on it.
# Hard-coded paths; both names are reassigned from sys.argv a few lines below,
# so when read top to bottom these two values do not survive.
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
hdfs_output_path = '/tmp/affro/results'
#folder_path = 'check'
# Starts as an empty list; nothing in the visible lines appends to it.
json_file_names = []
# folder_path comes from the first command-line argument and hdfs_output_path
# from the second; an IndexError is raised here if either is missing.
folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
# Initialize Spark session
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
# getOrCreate() is invoked a second time with a different app name and `spark`
# is rebound to its result.
# NOTE(review): presumably only one of these two lines belongs in the file.
spark = SparkSession.builder.appName("AffRo").getOrCreate()
def remove_duplicates(list_of_dicts):
# Use a set to store tuples of dictionary items to filter out duplicates