Added input/output path as parameters
This commit is contained in:
parent
5568aa92ec
commit
0500fc586f
|
@ -3,9 +3,10 @@ import os
|
|||
from pyspark.sql import SparkSession
|
||||
from affro_cluster import *
|
||||
|
||||
folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
|
||||
import sys
|
||||
|
||||
#folder_path = 'check'
|
||||
folder_path = sys.argv[1]
|
||||
hdfs_output_path = sys.argv[2]
|
||||
|
||||
json_file_names = []
|
||||
|
||||
|
@ -106,8 +107,6 @@ def update_record(record):
|
|||
print(f"Error processing record with id {record.get('id')}: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
|
||||
for file in json_file_names:
|
||||
print('start processing '+str(file))
|
||||
df = spark.read.json(folder_path + '/' + file)
|
||||
|
@ -122,7 +121,7 @@ for file in json_file_names:
|
|||
json_data = json_rdd.collect()
|
||||
|
||||
# Create a new filename by appending "_output.json" to the original filename (without extension)
|
||||
output_file_name = file+'_output.json'
|
||||
output_file_name = hdfs_output_path + "/" + file+'_output.json'
|
||||
print('end processing '+str(file))
|
||||
|
||||
with open(output_file_name, 'w') as f:
|
||||
|
|
Loading…
Reference in New Issue