Update update_records.py
This commit is contained in:
parent
eae8412a16
commit
78de49bb3a
|
@ -2,15 +2,13 @@ import json
|
||||||
import os
|
import os
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
from affro_cluster import *
|
from affro_cluster import *
|
||||||
|
import sys
|
||||||
|
|
||||||
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
|
folder_path = sys.argv[1]
|
||||||
hdfs_output_path = '/tmp/affro/results'
|
hdfs_output_path = sys.argv[2]
|
||||||
#folder_path = 'check'
|
|
||||||
|
|
||||||
json_file_names = []
|
|
||||||
|
|
||||||
# Initialize Spark session
|
# Initialize Spark session
|
||||||
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
|
spark = SparkSession.builder.appName("AffRo").getOrCreate()
|
||||||
|
|
||||||
def remove_duplicates(list_of_dicts):
|
def remove_duplicates(list_of_dicts):
|
||||||
# Use a set to store tuples of dictionary items to filter out duplicates
|
# Use a set to store tuples of dictionary items to filter out duplicates
|
||||||
|
|
Loading…
Reference in New Issue