Added input/output paths as parameters

This commit is contained in:
Miriam Baglioni 2024-12-04 15:14:58 +01:00
parent 5568aa92ec
commit 0500fc586f
1 changed file with 4 additions and 5 deletions

View File

@ -3,9 +3,10 @@ import os
from pyspark.sql import SparkSession
from affro_cluster import *
folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
import sys
#folder_path = 'check'
folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
json_file_names = []
@ -106,8 +107,6 @@ def update_record(record):
print(f"Error processing record with id {record.get('id')}: {str(e)}")
return None
for file in json_file_names:
print('start processing '+str(file))
df = spark.read.json(folder_path + '/' + file)
@ -122,7 +121,7 @@ for file in json_file_names:
json_data = json_rdd.collect()
# Create a new filename by appending "_output.json" to the original filename (without extension)
output_file_name = file+'_output.json'
output_file_name = hdfs_output_path + "/" + file+'_output.json'
print('end processing '+str(file))
with open(output_file_name, 'w') as f: