diff --git a/update_records.py b/update_records.py index ad122d1..6c2891a 100644 --- a/update_records.py +++ b/update_records.py @@ -3,9 +3,10 @@ import os from pyspark.sql import SparkSession from affro_cluster import * -folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short' +import sys -#folder_path = 'check' +folder_path = sys.argv[1] +hdfs_output_path = sys.argv[2] json_file_names = [] @@ -106,8 +107,6 @@ def update_record(record): print(f"Error processing record with id {record.get('id')}: {str(e)}") return None - - for file in json_file_names: print('start processing '+str(file)) df = spark.read.json(folder_path + '/' + file) @@ -122,7 +121,7 @@ for file in json_file_names: json_data = json_rdd.collect() # Create a new filename by appending "_output.json" to the original filename (without extension) - output_file_name = file+'_output.json' + output_file_name = hdfs_output_path + "/" + file+'_output.json' print('end processing '+str(file)) with open(output_file_name, 'w') as f: