From d5c39a10596f732d9a17fdb9d6c5abe014f88c4c Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 6 Jul 2023 15:04:48 +0300 Subject: [PATCH] Fix map scores to doi --- .../oa/graph/impact_indicators/job.properties | 2 +- .../oozie_app/map_scores_to_dois.py | 28 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) mode change 100644 => 100755 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index a2f3d5828..ea68ade1a 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -47,7 +47,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster # current year used when creating graph / by some ranking methods -currentYear=2024 +currentYear=2023 # Alpha value for pagerank pageRankAlpha=0.5 diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py old mode 100644 new mode 100755 index 0d294e045..0fc67eb53 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py @@ -1,3 +1,4 @@ +#!/usr/bin/python # This program reads the openaire to doi 
mapping from the ${synonymFolder} of the workflow # and uses this mapping to create doi-based score files in the format required by BiP! DB. # This is done by reading each openaire-id based ranking file and joining the openaire based @@ -17,28 +18,35 @@ import pyspark.sql.functions as F # from pyspark.sql.functions import udf ################################################################################################# ################################################################################################# -# Clean up directory name +# Clean up directory name - no longer needed in final workflow version +''' def clean_directory_name(dir_name): # We have a name with the form *_bip_universe_* or *_graph_universe_* # and we need to keep the parts in * + + dir_name_parts = dir_name.split('_') dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] - - clean_name = '_'.join(dir_name_parts) + + dir_name = dir_name.replace("openaire_id_graph", "openaire_ids") + clean_name = dir_name + ".txt.gz" - if '_ids' not in clean_name: - clean_name = clean_name.replace('id_', 'ids_') + # clean_name = '_'.join(dir_name_parts) + + # if '_ids' not in clean_name: + # clean_name = clean_name.replace('id_', 'ids_') # clean_name = clean_name.replace('.txt', '') # clean_name = clean_name.replace('.gz', '') - if 'openaire_ids_' in clean_name: - clean_name = clean_name.replace('openaire_ids_', '') + # if 'openaire_ids_' in clean_name: + # clean_name = clean_name.replace('openaire_ids_', '') # clean_name = clean_name + '.txt.gz' # else: # clean_name = clean_name + '.txt.gz' return clean_name +''' ################################################################################################# if len(sys.argv) < 3: print ("Usage: ./map_scores_to_dois.py <...etc...>") @@ -47,12 +55,12 @@ if len(sys.argv) < 3: # Read arguments synonyms_folder = sys.argv[1] num_partitions = int(sys.argv[2]) 
-input_file_list = [argument for argument in sys.argv[3:]] -input_file_list = [clean_directory_name(item) for item in input_file_list] +input_file_list = [argument.replace("_openaire_id_graph", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]] +# input_file_list = [clean_directory_name(item) for item in input_file_list] # Prepare output specific variables output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list] -output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list] +output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list] # --- INFO MESSAGES --- # print ("\n\n----------------------------")