Update update_records.py
This commit is contained in:
parent
eae8412a16
commit
78de49bb3a
|
@@ -2,15 +2,13 @@ import json
|
|||
import os
|
||||
from pyspark.sql import SparkSession
|
||||
from affro_cluster import *
|
||||
import sys
|
||||
|
||||
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
|
||||
hdfs_output_path = '/tmp/affro/results'
|
||||
#folder_path = 'check'
|
||||
|
||||
json_file_names = []
|
||||
folder_path = sys.argv[1]
|
||||
hdfs_output_path = sys.argv[2]
|
||||
|
||||
# Initialize Spark session
|
||||
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
|
||||
spark = SparkSession.builder.appName("AffRo").getOrCreate()
|
||||
|
||||
def remove_duplicates(list_of_dicts):
|
||||
# Use a set to store tuples of dictionary items to filter out duplicates
|
||||
|
|
Loading…
Reference in New Issue