forked from D-Net/dnet-hadoop
Fix map scores to doi
This commit is contained in:
parent
772d5f0aab
commit
d5c39a1059
|
@ -47,7 +47,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
|
||||||
resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster
|
resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster
|
||||||
|
|
||||||
# current year used when creating graph / by some ranking methods
|
# current year used when creating graph / by some ranking methods
|
||||||
currentYear=2024
|
currentYear=2023
|
||||||
|
|
||||||
# Alpha value for pagerank
|
# Alpha value for pagerank
|
||||||
pageRankAlpha=0.5
|
pageRankAlpha=0.5
|
||||||
|
|
28
dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
Normal file → Executable file
28
dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
Normal file → Executable file
|
@ -1,3 +1,4 @@
|
||||||
|
#!/usr/bin/python
|
||||||
# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
|
# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
|
||||||
# and uses this mapping to create doi-based score files in the format required by BiP! DB.
|
# and uses this mapping to create doi-based score files in the format required by BiP! DB.
|
||||||
# This is done by reading each openaire-id based ranking file and joining the openaire based
|
# This is done by reading each openaire-id based ranking file and joining the openaire based
|
||||||
|
@ -17,28 +18,35 @@ import pyspark.sql.functions as F
|
||||||
# from pyspark.sql.functions import udf
|
# from pyspark.sql.functions import udf
|
||||||
#################################################################################################
|
#################################################################################################
|
||||||
#################################################################################################
|
#################################################################################################
|
||||||
# Clean up directory name
|
# Clean up directory name - no longer needed in final workflow version
|
||||||
|
'''
|
||||||
def clean_directory_name(dir_name):
|
def clean_directory_name(dir_name):
|
||||||
# We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_*
|
# We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_*
|
||||||
# and we need to keep the parts in *
|
# and we need to keep the parts in *
|
||||||
|
|
||||||
|
|
||||||
dir_name_parts = dir_name.split('_')
|
dir_name_parts = dir_name.split('_')
|
||||||
dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
|
dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
|
||||||
|
|
||||||
clean_name = '_'.join(dir_name_parts)
|
dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
|
||||||
|
clean_name = dir_name + ".txt.gz"
|
||||||
|
|
||||||
if '_ids' not in clean_name:
|
# clean_name = '_'.join(dir_name_parts)
|
||||||
clean_name = clean_name.replace('id_', 'ids_')
|
|
||||||
|
# if '_ids' not in clean_name:
|
||||||
|
# clean_name = clean_name.replace('id_', 'ids_')
|
||||||
|
|
||||||
# clean_name = clean_name.replace('.txt', '')
|
# clean_name = clean_name.replace('.txt', '')
|
||||||
# clean_name = clean_name.replace('.gz', '')
|
# clean_name = clean_name.replace('.gz', '')
|
||||||
|
|
||||||
if 'openaire_ids_' in clean_name:
|
# if 'openaire_ids_' in clean_name:
|
||||||
clean_name = clean_name.replace('openaire_ids_', '')
|
# clean_name = clean_name.replace('openaire_ids_', '')
|
||||||
# clean_name = clean_name + '.txt.gz'
|
# clean_name = clean_name + '.txt.gz'
|
||||||
# else:
|
# else:
|
||||||
# clean_name = clean_name + '.txt.gz'
|
# clean_name = clean_name + '.txt.gz'
|
||||||
|
|
||||||
return clean_name
|
return clean_name
|
||||||
|
'''
|
||||||
#################################################################################################
|
#################################################################################################
|
||||||
if len(sys.argv) < 3:
|
if len(sys.argv) < 3:
|
||||||
print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
|
print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
|
||||||
|
@ -47,12 +55,12 @@ if len(sys.argv) < 3:
|
||||||
# Read arguments
|
# Read arguments
|
||||||
synonyms_folder = sys.argv[1]
|
synonyms_folder = sys.argv[1]
|
||||||
num_partitions = int(sys.argv[2])
|
num_partitions = int(sys.argv[2])
|
||||||
input_file_list = [argument for argument in sys.argv[3:]]
|
input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
|
||||||
input_file_list = [clean_directory_name(item) for item in input_file_list]
|
# input_file_list = [clean_directory_name(item) for item in input_file_list]
|
||||||
|
|
||||||
# Prepare output specific variables
|
# Prepare output specific variables
|
||||||
output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
|
output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
|
||||||
output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list]
|
output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]
|
||||||
|
|
||||||
# --- INFO MESSAGES --- #
|
# --- INFO MESSAGES --- #
|
||||||
print ("\n\n----------------------------")
|
print ("\n\n----------------------------")
|
||||||
|
|
Loading…
Reference in New Issue