Added input/output paths as parameters

This commit is contained in:
Miriam Baglioni 2024-12-04 15:14:58 +01:00
parent 5568aa92ec
commit 0500fc586f
1 changed file with 4 additions and 5 deletions

View File

@ -3,9 +3,10 @@ import os
from pyspark.sql import SparkSession
from affro_cluster import *
folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
import sys
#folder_path = 'check'
folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
json_file_names = []
@ -106,8 +107,6 @@ def update_record(record):
print(f"Error processing record with id {record.get('id')}: {str(e)}")
return None
for file in json_file_names:
print('start processing '+str(file))
df = spark.read.json(folder_path + '/' + file)
@ -122,7 +121,7 @@ for file in json_file_names:
json_data = json_rdd.collect()
# Create a new filename by appending "_output.json" to the original filename (without extension)
output_file_name = file+'_output.json'
output_file_name = hdfs_output_path + "/" + file+'_output.json'
print('end processing '+str(file))
with open(output_file_name, 'w') as f: