From 78de49bb3a11bee3c510a70704c30c5fec2b4e2d Mon Sep 17 00:00:00 2001
From: Serafeim Chatzopoulos
Date: Mon, 9 Sep 2024 15:23:39 +0200
Subject: [PATCH] Update update_records.py

---
 update_records.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/update_records.py b/update_records.py
index f65e5a0..21c35b3 100644
--- a/update_records.py
+++ b/update_records.py
@@ -2,15 +2,13 @@ import json
 import os
 from pyspark.sql import SparkSession
 from affro_cluster import *
+import sys
 
-folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
-hdfs_output_path = '/tmp/affro/results'
-#folder_path = 'check'
-
-json_file_names = []
+folder_path = sys.argv[1]
+hdfs_output_path = sys.argv[2]
 
 # Initialize Spark session
-spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
+spark = SparkSession.builder.appName("AffRo").getOrCreate()
 
 def remove_duplicates(list_of_dicts):
     # Use a set to store tuples of dictionary items to filter out duplicates