Fix map scores to doi

This commit is contained in:
Ilias Kanellos 2023-07-06 15:04:48 +03:00
parent 772d5f0aab
commit d5c39a1059
2 changed files with 19 additions and 11 deletions

View File

@@ -47,7 +47,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster
# current year used when creating graph / by some ranking methods # current year used when creating graph / by some ranking methods
currentYear=2024 currentYear=2023
# Alpha value for pagerank # Alpha value for pagerank
pageRankAlpha=0.5 pageRankAlpha=0.5

View File

@@ -1,3 +1,4 @@
#!/usr/bin/python
# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow # This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
# and uses this mapping to create doi-based score files in the format required by BiP! DB. # and uses this mapping to create doi-based score files in the format required by BiP! DB.
# This is done by reading each openaire-id based ranking file and joining the openaire based # This is done by reading each openaire-id based ranking file and joining the openaire based
@@ -17,28 +18,35 @@ import pyspark.sql.functions as F
# from pyspark.sql.functions import udf # from pyspark.sql.functions import udf
################################################################################################# #################################################################################################
################################################################################################# #################################################################################################
# Clean up directory name # Clean up directory name - no longer needed in final workflow version
'''
def clean_directory_name(dir_name): def clean_directory_name(dir_name):
# We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_* # We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_*
# and we need to keep the parts in * # and we need to keep the parts in *
dir_name_parts = dir_name.split('_') dir_name_parts = dir_name.split('_')
dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
clean_name = '_'.join(dir_name_parts) dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
clean_name = dir_name + ".txt.gz"
if '_ids' not in clean_name: # clean_name = '_'.join(dir_name_parts)
clean_name = clean_name.replace('id_', 'ids_')
# if '_ids' not in clean_name:
# clean_name = clean_name.replace('id_', 'ids_')
# clean_name = clean_name.replace('.txt', '') # clean_name = clean_name.replace('.txt', '')
# clean_name = clean_name.replace('.gz', '') # clean_name = clean_name.replace('.gz', '')
if 'openaire_ids_' in clean_name: # if 'openaire_ids_' in clean_name:
clean_name = clean_name.replace('openaire_ids_', '') # clean_name = clean_name.replace('openaire_ids_', '')
# clean_name = clean_name + '.txt.gz' # clean_name = clean_name + '.txt.gz'
# else: # else:
# clean_name = clean_name + '.txt.gz' # clean_name = clean_name + '.txt.gz'
return clean_name return clean_name
'''
################################################################################################# #################################################################################################
if len(sys.argv) < 3: if len(sys.argv) < 3:
print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>") print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
@@ -47,12 +55,12 @@ if len(sys.argv) < 3:
# Read arguments # Read arguments
synonyms_folder = sys.argv[1] synonyms_folder = sys.argv[1]
num_partitions = int(sys.argv[2]) num_partitions = int(sys.argv[2])
input_file_list = [argument for argument in sys.argv[3:]] input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
input_file_list = [clean_directory_name(item) for item in input_file_list] # input_file_list = [clean_directory_name(item) for item in input_file_list]
# Prepare output specific variables # Prepare output specific variables
output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list] output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list] output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]
# --- INFO MESSAGES --- # # --- INFO MESSAGES --- #
print ("\n\n----------------------------") print ("\n\n----------------------------")