From c6e39b7f334d0fa51c56c302511b697b207b454d Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 14 Mar 2023 18:50:54 +0200 Subject: [PATCH 01/41] Add dhp-impact-indicators --- .../dhp-impact-indicators/README.txt | 13 ++++++ dhp-workflows/dhp-impact-indicators/pom.xml | 41 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dhp-workflows/dhp-impact-indicators/README.txt create mode 100644 dhp-workflows/dhp-impact-indicators/pom.xml diff --git a/dhp-workflows/dhp-impact-indicators/README.txt b/dhp-workflows/dhp-impact-indicators/README.txt new file mode 100644 index 000000000..788534c02 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/README.txt @@ -0,0 +1,13 @@ + + +## Checkout a specific release of the BIP-Ranker git repository + +* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. + +* Then perform the checkout with: + +``` +mvn scm:checkout +``` + +* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml new file mode 100644 index 000000000..b827f42a4 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -0,0 +1,41 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + ../pom.xml + + + dhp-impact-indicators + + + 8 + 8 + UTF-8 + + + + https://github.com/athenarc/Bip-Ranker + https://github.com/athenarc/Bip-Ranker.git + + + + + + org.apache.maven.plugins + maven-scm-plugin + 1.8.1 + + connection + 2 + tag + ${project.build.directory}/../src/main/bip-ranker + + + + + \ No newline at end of file From 720fd19b3957bdc4d746778b7ab2fc306c4b2d14 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 14 Mar 2023 19:28:27 +0200 Subject: [PATCH 02/41] Add dhp-impact-indicators workflow files --- dhp-workflows/dhp-impact-indicators/README.md | 23 + .../dhp-impact-indicators/README.txt | 13 - dhp-workflows/dhp-impact-indicators/pom.xml | 6 +- .../create_openaire_ranking_graph.py | 234 ++++++ .../main/resources/format_ranking_results.py | 770 ++++++++++++++++++ .../src/main/resources/get_ranking_files.sh | 14 + .../src/main/resources/job.properties | 86 ++ .../resources/map_openaire_ids_to_dois.py | 60 ++ .../src/main/resources/map_scores_to_dois.py | 145 ++++ .../src/main/resources/workflow.xml | 600 ++++++++++++++ 10 files changed, 1935 insertions(+), 16 deletions(-) create mode 100644 dhp-workflows/dhp-impact-indicators/README.md delete mode 100644 dhp-workflows/dhp-impact-indicators/README.txt create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md new file mode 100644 index 000000000..14f489da3 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -0,0 +1,23 @@ +# Ranking Workflow for Openaire 
Publications + +This project contains the files for running a paper ranking workflow on the openaire graph using apache oozie. +All scripts are written in python and the project setup follows the typical oozie workflow structure: + +- a workflow.xml file containing the workflow specification +- a job.properties file specifying parameter values for the parameters used by the workflow +- a set of python scripts used by the workflow + +**NOTE**: the workflow depends on the external library of ranking scripts called BiP! Ranker. +You can check out a specific tag/release of BIP! Ranker using maven, as described in the following section. + +## Check out a specific tag/release of BIP-Ranker + +* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. + +* Then, use maven to perform the checkout: + +``` +mvn scm:checkout +``` + +* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/README.txt b/dhp-workflows/dhp-impact-indicators/README.txt deleted file mode 100644 index 788534c02..000000000 --- a/dhp-workflows/dhp-impact-indicators/README.txt +++ /dev/null @@ -1,13 +0,0 @@ - - -## Checkout a specific release of the BIP-Ranker git repository - -* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. - -* Then perform the checkout with: - -``` -mvn scm:checkout -``` - -* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml index b827f42a4..b510635a6 100644 --- a/dhp-workflows/dhp-impact-indicators/pom.xml +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -20,7 +20,7 @@ https://github.com/athenarc/Bip-Ranker - https://github.com/athenarc/Bip-Ranker.git + scm:git:https://github.com/athenarc/Bip-Ranker.git @@ -31,8 +31,8 @@ 1.8.1 connection - 2 - tag + tag + v1.0.0 ${project.build.directory}/../src/main/bip-ranker diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py new file mode 100644 index 000000000..4cffa86a3 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py @@ -0,0 +1,234 @@ +#!/usr/bin/python3 + +# Create openaire id - openaire id graph from openaire data + +############################################################################################################# +# Program proceeds as follows: +# 1. We read the input folder provided from hdfs. +# This contains subfolders with openaire graph objects and openaire graph relations +# 2. We select all openaire graph objects of interest. We filter out based on visibility +# and inference criteria. We also filter out based on the availability of publication year +# 3. Get reference type dataframes from openaire. Then filter each one of them based on the +# existence of citing and cited in the above filtered dataset. Get only citations +# produced by publication objects, or otherresearchproducts of types: +# [TBD] +# 4. Get objects that don't appear in the relations (from those gathered in step 1) and add +# them to the graph +# 5. 
Group relations by citing paper and do graph-specific formatting +############################################################################################################# +# ---------- Imports ------------- # +import sys +# import pyspark +# from pyspark import SparkConf, SparkContext +from pyspark.sql import SparkSession +# Functions to effectively handle data +# manipulation for DataFrames +import pyspark.sql.functions as F +# Diagnostics +from timeit import default_timer as timer +# from datetime import timedelta, datetime +# -------------------------------- # + +if len(sys.argv) < 5: + print ("Usage: ./create_openaire_ranking_graph.py ") + sys.exit(0) + +# Inputs will be: + +# 1. Folder where openaire graph is stored +graph_folder = sys.argv[1] +# 2. Current year (this will be needed for filtering) +current_year = int(sys.argv[2]) +# 3. Number of partitions +num_partitions = int(sys.argv[3]) +# 4. where to write output +output_folder = sys.argv[4] + +# Lists of results types we want to inclued in the citations +# valid_result_types = ['publication', 'other'] +valid_result_types = ['publication'] +# list of types in otherresearchproduct which are considered valid for citations +valid_other = [''] + +# Create the spark session +spark = SparkSession.builder.appName('oa ranking graph creation').getOrCreate() +# Set context level logging to WARN +spark.sparkContext.setLogLevel("WARN") + +############################################################################################################################ +# 1. Get the research objects and filter based on conditions. +# These will also be the unique identifiers we should find in the final graph + +# Initialize an empty dataframe +oa_objects_df = None + +# There is a directory structure on hdfs under the provided path. +# We need to parse data from the folders: ["publication", "dataset", "software", "otherresearchproduct"] +# which are rankable oa result objects. 
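#
# Illustrative input record (assumed shape, inferred from the fields selected below;
# not actual data) - each subfolder is expected to hold newline-delimited JSON like:
#   {"id": "50|dedup_wf_001::...",
#    "resulttype": {"classname": "publication"},
#    "datainfo": {"deletedbyinference": false, "invisible": false},
#    "dateofacceptance": {"value": "2019-05-10"}}
# Only records with deletedbyinference = false and invisible = false are kept,
# and the acceptance year is extracted with F.year('dateofacceptance.value').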
+ +# Loop subfolders +for sub_folder in ["publication", "dataset", "software", "otherresearchproduct"]: + # Read the json data of the graph into a dataframe initially + if not oa_objects_df: + oa_objects_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname', 'datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year')) + oa_objects_df = oa_objects_df.where( 'datainfo.deletedbyinference = false' ).where( 'datainfo.invisible = false' ).repartition(num_partitions, 'id').cache() + # If we already have data, simply add more to it + else: + sub_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname','datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year')) + sub_df = sub_df.where( 'datainfo.deletedbyinference = false ' ).where( 'datainfo.invisible = false ').cache() + # Add the data to the openaire objects dataframe + oa_objects_df = oa_objects_df.union(sub_df).repartition(num_partitions, 'id').cache() + # Clear memory + sub_df.unpersist(True) + +# Remove those records without year +oa_objects_df = oa_objects_df.where(F.col('year').isNotNull()) + + +# Now replace years where > (current_year+1) with 0 +oa_objects_df = oa_objects_df.withColumn('clean_year', F.when(F.col('year').cast('int') > (current_year+1), 0).otherwise(F.col('year')))\ + .drop('year').withColumnRenamed('clean_year', 'year').repartition(num_partitions, 'id') + +# -------------------------------------------------------------------- # +''' +# Some diagnostics +print ("Min and max years:" ) +oa_objects_df.select(F.max('year')).show() +oa_objects_df.select(F.min('year')).show() + +# This should be slow due to not repartitioning by year +print ("Distinct years:") +oa_objects_df.select('year').distinct().sort(F.col('year')).show(5000, False) + +# Show distinct values of deletedbyinference and invisible to ensure we have the correct data +print ("Distinct deleted by inference:") +oa_objects_df.select('deletedbyinference').distinct().show() +print ("Distinct invisible values:") +oa_objects_df.select('invisible').distinct().show() + +# Output total count +print ("Total num of research objects: " + str(oa_objects_df.count())) +''' +# -------------------------------------------------------------------- # + +# Keep only required fields - we still keep resulttype.classname to +# filter the citation relationships we consider valid +oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache() +############################################################################################################################ +# 2. 
Get the relation objects and filter them based on their existence in the oa_objects_df +# NOTE: we are only interested in citations of type "cites" +# Further, we + +# Deprecated line +# references_df = spark.read.json(graph_folder + "/relation").select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass')\ +# .where( 'relClass = "References"' ).repartition(num_partitions, 'citing').drop('relClass') +# print ("References df has: " + str(references_df.count()) + " entries") + +# Collect only valid citations i.e., invisible = false & deletedbyinference=false +cites_df = spark.read.json(graph_folder + "/relation")\ + .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ + .where( (F.col('relClass') == "Cites") \ + & (F.col('dataInfo.deletedbyinference') == "false")\ + & (F.col('dataInfo.invisible') == "false"))\ + .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ + .repartition(num_partitions, 'citing').drop('relClass') +# print ("Cited df has: " + str(cites_df.count()) + " entries") + +# DEPRECATED +# cited_by_df = spark.read.json(graph_folder + "/relation").select(F.col('target').alias('citing'), F.col('source').alias('cited'), 'relClass')\ +# .where( 'relClass = "IsCitedBy"' ).repartition(num_partitions, 'citing').drop('relClass') +# print ("Cited by df has: " + str(cited_by_df.count()) + " entries") + +# DEPRECATED +# Keep only relations where citing and cited are in the oa_objects_df +# references_df = references_df.join(oa_objects_df.select('id'), references_df.citing == oa_objects_df.id).drop('id') +# references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache() +# print ("References df now has: " + str(references_df.count()) + " entries") + +cites_df = cites_df.join(oa_objects_df.select('id'), cites_df.citing == oa_objects_df.id).where( F.col('resulttype.classname').isin(valid_result_types) ).drop('id').drop('resulttype.classname') +cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').drop('resulttype.classname').distinct().repartition(num_partitions, 'citing').cache() +# TODO: add here a clause filtering out the citations +# originating from "other" types of research objects which we consider valid + +# print ("Cites df now has: " + str(cites_df.count()) + " entries") + +# DEPRECATED +# cited_by_df = cited_by_df.join(oa_objects_df.select('id'), cited_by_df.citing == oa_objects_df.id).drop('id') +# cited_by_df = cited_by_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cited_by_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache() +# print ("Cited BY df now has: " + str(cited_by_df.count()) + " entries") + +# DEPRECATED +# Join all the above into a single set +# citations_df = references_df.union(cites_df).distinct().repartition(num_partitions, 'citing').cache() +# Free space +# references_df.unpersist(True) +# cites_df.unpersist(True) + +# citations_df = citations_df.union(cited_by_df).distinct().repartition(num_partitions, 'citing').cache() + +# ALL citations we keep are in the cited_df dataframe +citations_df = cites_df + +''' +# Show schema +print ("Citation schema:") +citations_df.printSchema() +print ("Objects schema:") +oa_objects_df.printSchema() +''' + +# Free 
space +# cited_by_df.unpersist(True) + +# Show total num of unique citations +num_unique_citations = citations_df.count() +print ("Total unique citations: " + str(num_unique_citations)) +############################################################################################################################ +# 3. Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references) +dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\ + .select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing') +# Count dangling nodes +dangling_num = dangling_nodes.count() +print ("Number of dangling nodes: " + str(dangling_num)) +# print ("Dangling nodes sample:") +# dangling_nodes.show(10, False) +############################################################################################################################ +# 4. Group the citation dataframe by citing doi, and create the cited dois list. Add dangling nodes to the result +graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited')).repartition(num_partitions, 'citing').cache() +# Free space +citations_df.unpersist(True) + +num_nodes = graph.count() +print ("Entries in graph before dangling nodes:" + str(num_nodes)) +# print ("Sample in graph: ") +# graph.show(10, False) + +# Add dangling nodes +graph = graph.union(dangling_nodes).repartition(num_partitions, 'citing') +# Count current number of results +num_nodes = graph.count() +print ("Num entries after adding dangling nodes: " + str(num_nodes)) + +# Add publication year +graph = graph.join(oa_objects_df, graph.citing == oa_objects_df.id).select('citing', 'cited', 'year').cache() +num_nodes_final = graph.count() +print ("After adding year: " + str(num_nodes_final)) +# print ("Graph sample:") +# graph.show(20, False) +# Calculate initial score of nodes (1/N) +initial_score = float(1)/float(num_nodes_final) +############################################################################################################################ +# 5. Write graph to output file! +print("Writing output to: " + output_folder) + +graph.select('citing', F.concat_ws("|", F.concat_ws(",",'cited'), F.when(F.col('cited').getItem(1) != "0", F.size('cited')).otherwise(F.lit("0")), F.lit(str(initial_score)) ).alias('cited'), 'year').withColumn('prev_pr', F.lit("0")).select('citing', 'cited', 'prev_pr', 'year')\ + .write.mode("overwrite").option("delimiter","\t").csv(output_folder, compression="gzip") + +if num_nodes_final != num_nodes: + print ("WARNING: the number of nodes after keeping only nodes where year is available went from: " + str(num_nodes) + " to " + str(num_nodes_final) + "\n") + print ("Check for any mistakes...") + +############################################################################################################################ +print ("\nDONE!\n\n") +# Wrap up +spark.stop() diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py new file mode 100644 index 000000000..60c71e52f --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py @@ -0,0 +1,770 @@ +# This program reads hdfs directories containing ranking results from openaire's cluster. 
+# Based on the parameters provided by the user, it will create different types of output files. + +# Modes available are: +# 1. bip +# This will result in output of the form required for bip-finder's update. +# Its lines conform to the following format: +# \t \t \t \t \t \t \t <3y_cc> \t <3y_cc_normalized> \t \t + +# 2. zenodo +# This is the format used in zenodo for Bip-DB. (6 way classes will be named C1, C2, ..., C6) +# This should output two files per ranking method with each line having the following data: +# a. <6-way-class> +# NOTE: this should also run for openaire-id files, hence we should have a total of 4 files per ranking (2 for each type of identifier) +# In 'zenodo' mode the user specifies only a single file, for which zenodo-based output will be created + +# 3. json +# This if the format used to provide openAIRE / claudio with data containing 1 json per identifier +# An example of such a json format follows: +#{ +# "50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [ +# { +# "id": "influence", +# "unit": [ +# { +# "key": "score", +# "value": "5.06690394631e-09" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "popularity_alt", +# "unit": [ +# { +# "key": "score", +# "value": "0.0" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "popularity", +# "unit": [ +# { +# "key": "score", +# "value": "3.11855618382e-09" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "influence_alt", +# "unit": [ +# { +# "key": "score", +# "value": "0.0" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "impulse", +# "unit": [ +# { +# "key": "score", +# "value": "0.0" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# } +# ] +#} + + +################################################################################################# +# Imports +import sys +import time + +# Sparksession lib to communicate with cluster via session object +from pyspark.sql import SparkSession + +# Import sql types to define the schema of score output files +from pyspark.sql.types import * + +# Import sql functions with shorthand alias +import pyspark.sql.functions as F +from pyspark.sql.functions import udf + +# Json specific encoding +import json +################################################################################################# +# Clean up directory name +def clean_directory_name(dir_name): + # We have a name with the form *_bip_universe_* or *_graph_universe_* + # and we need to keep the parts in * + dir_name_parts = dir_name.split('_') + dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] + + clean_name = '_'.join(dir_name_parts) + clean_name = clean_name.replace('_id', '_ids') + + clean_name = clean_name.replace('.txt', '') + clean_name = clean_name.replace('.gz', '') + + if 'openaire_ids_' in clean_name: + clean_name = clean_name.replace('openaire_ids_', '') + clean_name = clean_name + '_openaire_ids.txt.gz' + else: + clean_name = clean_name + '.txt.gz/' + + return clean_name +# --------------------------------------------------------------------------------------------- # +# User defined function to escape special characters in a string that will turn into a json key +@udf(StringType()) +def json_encode_key(doi_string): + return json.dumps(doi_string) +################################################################################################# +# 
--------------------------------------------------------------------------------------------- # +# Arguments from command line and initializations + +# Time initialization +start_time = time.time() + +# Check whether input is correct, otherwise exit with appropriate message +if len(sys.argv) < 2: + print ("Usage: ./format_ranking_results.py ") + sys.exit(0) + +# Define valid modes: +valid_modes = ['json', 'zenodo', 'bip', 'json-5-way'] +# Read mode provided by user +mode = sys.argv[1].strip() + +# If mode isn't valid, exit +if mode not in valid_modes: + print ("Usage: ./format_ranking_results.py \n") + print ("Invalid mode provided. Valid modes: ['zenodo', 'bip', 'json', 'json-5-way']") + sys.exit(0) + + +# Once here, we should be more or less okay to run. + +# Define the spark session object +spark = SparkSession.builder.appName('Parse Scores - ' + str(mode) + ' mode').getOrCreate() +# Set Log Level for spark session +spark.sparkContext.setLogLevel('WARN') + +# Here we define the schema shared by all score output files +# - citation count variants have a slightly different schema, due to their scores being integers +float_schema = StructType([ + StructField('id', StringType(), False), + StructField('score', FloatType(), False), + StructField('normalized_score', FloatType(), False), + StructField('3-way-class', StringType(), False), + StructField('5-way-class', StringType(), False) + ]) + +int_schema = StructType([ + StructField('id', StringType(), False), + StructField('score', IntegerType(), False), + StructField('normalized_score', FloatType(), False), + StructField('3-way-class', StringType(), False), + StructField('5-way-class', StringType(), False) + ]) + +# This schema concerns the output of the file +# containing the number of references of each doi +refs_schema = StructType([ + StructField('id', StringType(), False), + StructField('num_refs', IntegerType(), False), + ]) + +print("--- Initialization time: %s seconds ---" % (time.time() - start_time)) + +# --------------------------------------------------------------------------------------------- # + +# Time the main program execution +start_time = time.time() + +# The following is executed when the user requests the bip-update specific file +if mode == 'bip': + + # Read the remaining input files + if len(sys.argv) < 8: + print ("\n\nInsufficient input for 'bip' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + + # Read number of partitions: + num_partitions = int(sys.argv[-1]) + + + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + refs_dir = sys.argv[7] + + # Score-specific dataframe + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id') + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') + refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 
'id') + + # ----------- TESTING CODE --------------- # + # pagerank_entries = pagerank_df.count() + # attrank_entries = attrank_df.count() + # cc_entries = cc_df.count() + # impulse_entries = impulse_df.count() + # ram_entries = ram_df.count() + # refs_entries = refs_df.count() + + # print ("Pagerank:" + str(pagerank_entries)) + # print ("AttRank:" + str(attrank_entries)) + # print ("CC entries: " + str(cc_entries)) + # print ("Impulse entries: " + str(impulse_entries)) + # print ("Refs: " + str(refs_entries)) + # ---------------------------------------- # + + # Create a new dataframe with the required data + results_df = pagerank_df.select('id', F.col('score').alias('pagerank'), F.col('normalized_score').alias('pagerank_normalized')) + # Add attrank dataframe + results_df = results_df.join(attrank_df.select('id', 'score', 'normalized_score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', F.col('score').alias('attrank'), F.col('normalized_score').alias('attrank_normalized')) + + # Add citation count dataframe + results_df = results_df.join(cc_df.select('id', 'score', 'normalized_score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', F.col('score').alias('cc'), F.col('normalized_score').alias('cc_normalized')) + + # Add 3-year df + results_df = results_df.join(impulse_df.select('id', 'score', 'normalized_score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized', \ + F.col('score').alias('3-cc'), F.col('normalized_score').alias('3-cc_normalized')) + + # Add ram df + results_df = results_df.join(ram_df.select('id', 'score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized',\ + '3-cc', '3-cc_normalized', F.col('score').alias('ram')) + + # Add references + results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \ + 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs') + + # Write resulting dataframe to file + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + output_dir = output_dir + '/bip_update_data.txt.gz' + + print("Writing to:" + output_dir) + results_df.write.mode('overwrite').option('delimiter','\t').option('header',True).csv(output_dir, compression='gzip') + +# The following is executed when the user requests the zenodo-specific file +elif mode == 'zenodo': + + # Read the remaining input files + if len(sys.argv) < 9: + print ("\n\nInsufficient input for 'zenodo' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + # Read number of partitions: + num_partitions = int(sys.argv[-2]) + graph_type = sys.argv[-1] + + if graph_type not in ['bip', 'openaire']: + graph_type = 'bip' + + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + + # Output directory is common for all files + output_dir_prefix = "/".join(pagerank_dir.split('/')[:-1]) + # Method-specific outputs + pagerank_output = clean_directory_name(pagerank_dir.split('/')[-1]) + attrank_output = clean_directory_name(attrank_dir.split('/')[-1]) + cc_output = clean_directory_name(cc_dir.split('/')[-1]) + impulse_output = clean_directory_name(impulse_dir.split('/')[-1]) + ram_output = clean_directory_name(ram_dir.split('/')[-1]) + + # --------- PageRank ----------- # + # Get per file the doi - score - 6-way 
classes and write it to output + print("Writing to: " + output_dir_prefix + '/' + pagerank_output) + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + pagerank_df = pagerank_df.withColumn('class', F.lit('C6')) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.drop('5-way-class') + + if graph_type == 'openaire': + pagerank_df = pagerank_df.where( ~F.col('id').like('10.%') ) + + # Write output + pagerank_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + pagerank_output, compression='gzip') + # --------- AttRank ----------- # + print("Writing to: " + output_dir_prefix + '/' + attrank_output) + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + attrank_df = attrank_df.withColumn('class', F.lit('C6')) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + attrank_df = attrank_df.drop('5-way-class') + + if graph_type == 'openaire': + attrank_df = attrank_df.where( ~F.col('id').like('10.%') ) + + # Write output + attrank_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + attrank_output, compression='gzip') + # --------- Citation Count ----------- # + print("Writing to: " + output_dir_prefix + '/' + cc_output) + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + cc_df = cc_df.withColumn('class', F.lit('C5')) + # cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + cc_df = 
cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + cc_df = cc_df.drop('5-way-class') + + if graph_type == 'openaire': + cc_df = cc_df.where( ~F.col('id').like('10.%') ) + + # Write output + cc_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + cc_output, compression='gzip') + # --------- Impulse ----------- # + print("Writing to: " + output_dir_prefix + '/' + impulse_output) + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + impulse_df = impulse_df.withColumn('class', F.lit('C5')) + # impulse_df = impulse_df.withColumn('class', F.when(F.col('6-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + impulse_df = impulse_df.drop('5-way-class') + + if graph_type == 'openaire': + impulse_df = impulse_df.where( ~F.col('id').like('10.%') ) + + # Write output + impulse_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + impulse_output, compression='gzip') + # --------- RAM ----------- # + print("Writing to: " + output_dir_prefix + '/' + ram_output) + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + ram_df = ram_df.withColumn('class', F.lit('C5')) + # ram_df = ram_df.withColumn('class', F.when(F.col('6-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + ram_df = ram_df.drop('5-way-class') + + if graph_type == 'openaire': + ram_df = ram_df.where( ~F.col('id').like('10.%') ) + + # Write output + ram_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + ram_output, compression='gzip') + +# The following produces the json file required by openaire +elif mode == 'json': + + # Read the remaining input files + if len(sys.argv) < 9: + print ("\n\nInsufficient input for 'json' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + # Read number of partitions: + num_partitions = int(sys.argv[-2]) + graph_type = sys.argv[-1] + + if graph_type not in ['bip', 'openaire']: + graph_type = 'bip' + + print ("Graph type: " + str(graph_type)) + + # File directories + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = 
sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + + print ("Reading files:") + print (pagerank_dir) + print (attrank_dir) + print (cc_dir) + print (impulse_dir) + print (ram_dir) + + # Score-specific dataframe - read inputs + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') + # --- Join the data of the various scores --- # + + # Create json data for pagerank + pagerank_df = pagerank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('id'), F.lit('influence')).alias('id_map'), F.col('influence_values')) + pagerank_df = pagerank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence'))).alias('influence_key'), F.to_json(F.col('influence_values')).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.expr('substring(influence_key, 0, length(influence_key)-1)').alias('influence_key'), 'influence_values') + pagerank_df = pagerank_df.select('id', 'influence_key', F.expr('substring(influence_values, 2, length(influence_values))').alias('influence_values')) + pagerank_df = pagerank_df.select('id', F.concat_ws(', ', F.col('influence_key'), F.col('influence_values')).alias('influence_json')) + + # Create json data for attrank + attrank_df = attrank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + attrank_df = attrank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.create_map(F.lit('id'), F.lit('popularity')).alias('id_map'), F.col('popularity_values')) + attrank_df = attrank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity'))).alias('popularity_key'), F.to_json(F.col('popularity_values')).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.expr('substring(popularity_key, 0, length(popularity_key)-1)').alias('popularity_key'), 'popularity_values') + attrank_df = attrank_df.select('id', 'popularity_key', F.expr('substring(popularity_values, 2, length(popularity_values))').alias('popularity_values')) + attrank_df = attrank_df.select('id', F.concat_ws(', ', F.col('popularity_key'), F.col('popularity_values')).alias('popularity_json')) + + # Create json data for CC + cc_df = cc_df.select('id', 
F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + cc_df = cc_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.create_map(F.lit('id'), F.lit('influence_alt')).alias('id_map'), F.col('influence_alt_values')) + cc_df = cc_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence_alt'))).alias('influence_alt_key'), F.to_json(F.col('influence_alt_values')).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.expr('substring(influence_alt_key, 0, length(influence_alt_key)-1)').alias('influence_alt_key'), 'influence_alt_values') + cc_df = cc_df.select('id', 'influence_alt_key', F.expr('substring(influence_alt_values, 2, length(influence_alt_values))').alias('influence_alt_values')) + cc_df = cc_df.select('id', F.concat_ws(', ', F.col('influence_alt_key'), F.col('influence_alt_values')).alias('influence_alt_json')) + + + # Create json data for RAM + ram_df = ram_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + ram_df = ram_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.create_map(F.lit('id'), F.lit('popularity_alt')).alias('id_map'), F.col('popularity_alt_values')) + ram_df = ram_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity_alt'))).alias('popularity_alt_key'), F.to_json(F.col('popularity_alt_values')).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.expr('substring(popularity_alt_key, 0, length(popularity_alt_key)-1)').alias('popularity_alt_key'), 'popularity_alt_values') + ram_df = ram_df.select('id', 'popularity_alt_key', F.expr('substring(popularity_alt_values, 2, length(popularity_alt_values))').alias('popularity_alt_values')) + ram_df = ram_df.select('id', F.concat_ws(', ', F.col('popularity_alt_key'), F.col('popularity_alt_values')).alias('popularity_alt_json')) + + # Create json data for impulse + impulse_df = impulse_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + impulse_df = impulse_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.create_map(F.lit('id'), F.lit('impulse')).alias('id_map'), F.col('impulse_values')) + impulse_df = impulse_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('impulse'))).alias('impulse_key'), F.to_json(F.col('impulse_values')).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.expr('substring(impulse_key, 0, length(impulse_key)-1)').alias('impulse_key'), 'impulse_values') + impulse_df = impulse_df.select('id', 'impulse_key', F.expr('substring(impulse_values, 2, length(impulse_values))').alias('impulse_values')) + impulse_df = impulse_df.select('id', F.concat_ws(', ', 
F.col('impulse_key'), F.col('impulse_values')).alias('impulse_json')) + + #Join dataframes together + results_df = pagerank_df.join(attrank_df, ['id']) + results_df = results_df.join(cc_df, ['id']) + results_df = results_df.join(ram_df, ['id']) + results_df = results_df.join(impulse_df, ['id']) + + print ("Json encoding DOI keys") + # Json encode doi strings + results_df = results_df.select(json_encode_key('id').alias('id'), 'influence_json', 'popularity_json', 'influence_alt_json', 'popularity_alt_json', 'impulse_json') + + # Concatenate individual json columns + results_df = results_df.select('id', F.concat_ws(', ', F.col('influence_json'), F.col('popularity_json'), F.col('influence_alt_json'), F.col('popularity_alt_json'), F.col('impulse_json') ).alias('json_data')) + results_df = results_df.select('id', F.concat_ws('', F.lit('['), F.col('json_data'), F.lit(']')).alias('json_data') ) + + # Filter out non-openaire ids if need + if graph_type == 'openaire': + results_df = results_df.where( ~F.col('id').like('"10.%') ) + + # Concatenate paper id and add opening and ending brackets + results_df = results_df.select(F.concat_ws('', F.lit('{'), F.col('id'), F.lit(': '), F.col('json_data'), F.lit('}')).alias('json') ) + + # -------------------------------------------- # + # Write json output - set the directory here + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + if graph_type == 'bip': + output_dir = output_dir + '/bip_universe_doi_scores/' + else: + output_dir = output_dir + '/openaire_universe_scores/' + + # Write the dataframe + print ("Writing output to: " + output_dir) + results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + + # Rename the files to .json.gz now + sc = spark.sparkContext + URI = sc._gateway.jvm.java.net.URI + Path = sc._gateway.jvm.org.apache.hadoop.fs.Path + FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem + # Get master prefix from input file path + master_prefix = "/".join(pagerank_dir.split('/')[:5]) + fs = FileSystem.get(URI(master_prefix), sc._jsc.hadoopConfiguration()) + path = Path(output_dir) + print ("Path is:" + path.toString()) + file_list = fs.listStatus(Path(output_dir)) + print ("Renaming files:") + for f in file_list: + initial_filename = f.getPath().toString() + if "part" in initial_filename: + print (initial_filename + " => " + initial_filename.replace(".txt.gz", ".json.gz")) + fs.rename(Path(initial_filename), Path(initial_filename.replace(".txt.gz", ".json.gz"))) + + + ''' + DEPRECATED: + # -------------------------------------------- # + # Write json output + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + if graph_type == 'bip': + output_dir = output_dir + '/bip_universe_doi_scores_txt/' + else: + output_dir = output_dir + '/openaire_universe_scores_txt/' + + print ("Writing output to: " + output_dir) + results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + print ("Done writing first results") + # Read results df as json and write it as json file + print ("Reading json input from: " + str(output_dir)) + resulds_df_json = spark.read.json(output_dir).cache() + # Write json to different dir + print ("Writing json output to: " + output_dir.replace("_txt", "")) + resulds_df_json.write.mode('overwrite').json(output_dir.replace("_txt", ""), compression='gzip') + ''' + +# The following produces the json file required by openaire +elif mode == 'json-5-way': + + # Read the remaining input files + if len(sys.argv) < 9: + print ("\n\nInsufficient input 
for 'json-5-way' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + # Read number of partitions: + num_partitions = int(sys.argv[-2]) + graph_type = sys.argv[-1] + + if graph_type not in ['bip', 'openaire']: + graph_type = 'bip' + + # File directories + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + + # Score-specific dataframe - read inputs + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') + # --- Join the data of the various scores --- # + + + # Replace 6-way classes with 5-way values + pagerank_df = pagerank_df.withColumn('class', F.lit('C5')) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + + # Create json data for pagerank + pagerank_df = pagerank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + + + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('id'), F.lit('influence')).alias('id_map'), F.col('influence_values')) + pagerank_df = pagerank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence'))).alias('influence_key'), F.to_json(F.col('influence_values')).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.expr('substring(influence_key, 0, length(influence_key)-1)').alias('influence_key'), 'influence_values') + pagerank_df = pagerank_df.select('id', 'influence_key', F.expr('substring(influence_values, 2, length(influence_values))').alias('influence_values')) + pagerank_df = pagerank_df.select('id', F.concat_ws(', ', F.col('influence_key'), F.col('influence_values')).alias('influence_json')) + + # Replace 6-way classes with 5 way classes for attrank + attrank_df = attrank_df.withColumn('class', F.lit('C5')) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), 
F.lit('C3')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + attrank_df = attrank_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for attrank + attrank_df = attrank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + attrank_df = attrank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.create_map(F.lit('id'), F.lit('popularity')).alias('id_map'), F.col('popularity_values')) + attrank_df = attrank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity'))).alias('popularity_key'), F.to_json(F.col('popularity_values')).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.expr('substring(popularity_key, 0, length(popularity_key)-1)').alias('popularity_key'), 'popularity_values') + attrank_df = attrank_df.select('id', 'popularity_key', F.expr('substring(popularity_values, 2, length(popularity_values))').alias('popularity_values')) + attrank_df = attrank_df.select('id', F.concat_ws(', ', F.col('popularity_key'), F.col('popularity_values')).alias('popularity_json')) + + # Replace 6-way classes with 5 way classes for attrank + cc_df = cc_df.withColumn('class', F.lit('C5')) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + cc_df = cc_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for CC + cc_df = cc_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + cc_df = cc_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.create_map(F.lit('id'), F.lit('influence_alt')).alias('id_map'), F.col('influence_alt_values')) + cc_df = cc_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence_alt'))).alias('influence_alt_key'), F.to_json(F.col('influence_alt_values')).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.expr('substring(influence_alt_key, 0, length(influence_alt_key)-1)').alias('influence_alt_key'), 'influence_alt_values') + cc_df = cc_df.select('id', 'influence_alt_key', F.expr('substring(influence_alt_values, 2, length(influence_alt_values))').alias('influence_alt_values')) + cc_df = cc_df.select('id', F.concat_ws(', ', F.col('influence_alt_key'), F.col('influence_alt_values')).alias('influence_alt_json')) + + # Replace 6-way 
classes with 5 way classes for attrank + ram_df = ram_df.withColumn('class', F.lit('C5')) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + ram_df = ram_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for RAM + ram_df = ram_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + ram_df = ram_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.create_map(F.lit('id'), F.lit('popularity_alt')).alias('id_map'), F.col('popularity_alt_values')) + ram_df = ram_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity_alt'))).alias('popularity_alt_key'), F.to_json(F.col('popularity_alt_values')).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.expr('substring(popularity_alt_key, 0, length(popularity_alt_key)-1)').alias('popularity_alt_key'), 'popularity_alt_values') + ram_df = ram_df.select('id', 'popularity_alt_key', F.expr('substring(popularity_alt_values, 2, length(popularity_alt_values))').alias('popularity_alt_values')) + ram_df = ram_df.select('id', F.concat_ws(', ', F.col('popularity_alt_key'), F.col('popularity_alt_values')).alias('popularity_alt_json')) + + # Replace 6-way classes with 5 way classes for attrank + impulse_df = impulse_df.withColumn('class', F.lit('C5')) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + impulse_df = impulse_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for impulse + impulse_df = impulse_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + impulse_df = impulse_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.create_map(F.lit('id'), F.lit('impulse')).alias('id_map'), F.col('impulse_values')) + impulse_df = impulse_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('impulse'))).alias('impulse_key'), F.to_json(F.col('impulse_values')).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.expr('substring(impulse_key, 0, length(impulse_key)-1)').alias('impulse_key'), 
'impulse_values') + impulse_df = impulse_df.select('id', 'impulse_key', F.expr('substring(impulse_values, 2, length(impulse_values))').alias('impulse_values')) + impulse_df = impulse_df.select('id', F.concat_ws(', ', F.col('impulse_key'), F.col('impulse_values')).alias('impulse_json')) + + #Join dataframes together + results_df = pagerank_df.join(attrank_df, ['id']) + results_df = results_df.join(cc_df, ['id']) + results_df = results_df.join(ram_df, ['id']) + results_df = results_df.join(impulse_df, ['id']) + + print ("Json encoding DOI keys") + # Json encode doi strings + results_df = results_df.select(json_encode_key('id').alias('id'), 'influence_json', 'popularity_json', 'influence_alt_json', 'popularity_alt_json', 'impulse_json') + + # Concatenate individual json columns + results_df = results_df.select('id', F.concat_ws(', ', F.col('influence_json'), F.col('popularity_json'), F.col('influence_alt_json'), F.col('popularity_alt_json'), F.col('impulse_json') ).alias('json_data')) + results_df = results_df.select('id', F.concat_ws('', F.lit('['), F.col('json_data'), F.lit(']')).alias('json_data') ) + + # Filter out non-openaire ids if need + if graph_type == 'openaire': + results_df = results_df.where( ~F.col('id').like('10.%') ) + + # Concatenate paper id and add opening and ending brackets + results_df = results_df.select(F.concat_ws('', F.lit('{'), F.col('id'), F.lit(': '), F.col('json_data'), F.lit('}')).alias('json') ) + + # TEST output and count + # results_df.show(20, False) + # print ("Results #" + str(results_df.count())) + + # -------------------------------------------- # + # Write json output + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + if graph_type == 'bip': + output_dir = output_dir + '/bip_universe_doi_scores_5_classes/' + else: + output_dir = output_dir + '/openaire_universe_scores_5_classes/' + + print ("Writing output to: " + output_dir) + results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + +# Close spark session +spark.stop() + +print("--- Main program execution time: %s seconds ---" % (time.time() - start_time)) +print("--- Finished --- \n\n") + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh new file mode 100644 index 000000000..4d0fedba9 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh @@ -0,0 +1,14 @@ +ranking_results_folder=$1; + +pr_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/PR_.*" | grep -o "PR.*"`; +attrank_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/AttRank.*" | grep -o "AttRank.*"`; +cc_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/CC_.*" | grep -o "CC.*"`; +impulse_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/3-year_.*" | grep -o "3-year.*"`; +ram_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/RAM_.*" | grep -o "RAM.*"`; + +echo "pr_file=${pr_file}"; +echo "attrank_file=${attrank_file}"; +echo "cc_file=${cc_file}"; +echo "impulse_file=${impulse_file}"; +echo "ram_file=${ram_file}"; +# echo "TEST=`hdfs dfs -ls ${ranking_results_folder}/`"; diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties new file mode 100644 index 000000000..9ad9def21 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties @@ -0,0 +1,86 @@ +# The following set of properties are defined 
in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters +# and concern the parameterization required for running workflows on the @GARR cluster + +dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos +dhp.hadoop.frontend.user.name=ilias.kanellos +dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl +dhp.hadoop.frontend.port.ssh=22 +oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie +jobTracker=yarnRM +nameNode=hdfs://nameservice1 +oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log +maven.executable=mvn +sparkDriverMemory=7G +sparkExecutorMemory=7G +sparkExecutorCores=4 +# The above is given differently in an example I found online +oozie.action.sharelib.for.spark=spark2 +oozieActionShareLibForSpark2=spark2 +spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 +spark2EventLogDir=/user/spark/spark2ApplicationHistory +sparkSqlWarehouseDir=/user/hive/warehouse +hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 +# This MAY avoid the no library used error +oozie.use.system.libpath=true +# Some stuff copied from openaire's jobs +spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener +spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener + + +# Some stuff copied from openaire's jobs +spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener +spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener + +# ------------------------------------------------------------------------------ # +# The following set of properties are my own custom ones + +# Based on the page linked to at the start of the file, if we use yarn as a resource manager, its address is given as follows +resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster + +# current year used when creating graph / by some ranking methods +currentYear=2024 + +# Alpha value for pagerank +pageRankAlpha=0.5 +# AttRank values +attrankAlpha=0.2 +attrankBeta=0.5 +attrankGamma=0.3 +attrankRho=-0.16 +# attrankCurrentYear=2023 +attrankStartYear=2021 + +# Ram values +ramGamma=0.6 +# ramCurrentYear=2023 + +# Convergence error for pagerank +convergenceError=0.000000000001 + +# I think this should be the oozie workflow directory +oozieWorkflowPath=user/ilias.kanellos/workflow_example/ + +# The directory where the workflow data is/should be stored +workflowDataDir=user/ilias.kanellos/ranking_workflow + +# Directory where dataframes are checkpointed +checkpointDir=${nameNode}/${workflowDataDir}/check/ + +# The directory for the doi-based bip graph +bipGraphFilePath=${nameNode}/${workflowDataDir}/bipdbv8_graph + +# The folder from which synonyms of openaire-ids are read +# openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/ +openaireDataInput=${/tmp/prod_provision/graph/18_graph_blacklisted} + +# A folder where we will write the openaire to doi mapping +synonymFolder=${nameNode}/${workflowDataDir}/openaireid_to_dois/ + +# This will be where we store the openaire graph input. 
They told us on GARR to use a directory under /data +openaireGraphInputPath=${nameNode}/${workflowDataDir}/openaire_id_graph + +# The workflow application path +wfAppPath=${nameNode}/${oozieWorkflowPath} +# The following is needed as a property of a workflow +oozie.wf.application.path=${wfAppPath} + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py new file mode 100644 index 000000000..7997eec82 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py @@ -0,0 +1,60 @@ +import json +import sys +from pyspark.sql import SparkSession +from pyspark import SparkConf, SparkContext + +if len(sys.argv) != 3: + print("Usage: map_openaire_ids_to_dois.py ") + sys.exit(-1) + +conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs') +sc = SparkContext(conf = conf) +spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate() +sc.setLogLevel('OFF') + +src_dir = sys.argv[1] +output = sys.argv[2] + +# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/" +# output = '/tmp/openaireid_to_dois/' + +def transform(doc): + + # get publication year from 'doc.dateofacceptance.value' + dateofacceptance = doc.get('dateofacceptance', {}).get('value') + + year = 0 + + if (dateofacceptance is not None): + year = dateofacceptance.split('-')[0] + + # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi' + dois = [ pid['value'] for pid in doc.get('pid', []) if (pid.get('qualifier', {}).get('classid') == 'doi' and pid['value'] is not None)] + + num_dois = len(dois) + + # exlcude openaire ids that do not correspond to DOIs + if (num_dois == 0): + return None + + fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ] + + return '\t'.join([ v.encode('utf-8') for v in fields ]) + +docs = None + +for result_type in ["publication", "dataset", "software", "otherresearchproduct"]: + + tmp = sc.textFile(src_dir + result_type).map(json.loads) + + if (docs is None): + docs = tmp + else: + # append all result types in one RDD + docs = docs.union(tmp) + +docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False) + +docs = docs.map(transform).filter(lambda d: d is not None) + +docs.saveAsTextFile(output) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py new file mode 100644 index 000000000..0d294e045 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py @@ -0,0 +1,145 @@ +# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow +# and uses this mapping to create doi-based score files in the format required by BiP! DB. +# This is done by reading each openaire-id based ranking file and joining the openaire based +# score and classes to all the corresponding dois. 
+#################################################################################################
+# Imports
+import sys
+
+# Sparksession lib to communicate with cluster via session object
+from pyspark.sql import SparkSession
+
+# Import sql types to define schemas
+from pyspark.sql.types import *
+
+# Import sql functions with shorthand alias
+import pyspark.sql.functions as F
+# from pyspark.sql.functions import udf
+#################################################################################################
+#################################################################################################
+# Clean up directory name
+def clean_directory_name(dir_name):
+    # We have a name with the form *_bip_universe_* or *_graph_universe_*
+    # and we need to keep the parts in *
+    dir_name_parts = dir_name.split('_')
+    dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
+
+    clean_name = '_'.join(dir_name_parts)
+
+    if '_ids' not in clean_name:
+        clean_name = clean_name.replace('id_', 'ids_')
+
+    # clean_name = clean_name.replace('.txt', '')
+    # clean_name = clean_name.replace('.gz', '')
+
+    if 'openaire_ids_' in clean_name:
+        clean_name = clean_name.replace('openaire_ids_', '')
+        # clean_name = clean_name + '.txt.gz'
+    # else:
+        # clean_name = clean_name + '.txt.gz'
+
+    return clean_name
+#################################################################################################
+if len(sys.argv) < 3:
+    print ("Usage: ./map_scores_to_dois.py <...etc...>")
+    sys.exit(-1)
+
+# Read arguments
+synonyms_folder = sys.argv[1]
+num_partitions = int(sys.argv[2])
+input_file_list = [argument for argument in sys.argv[3:]]
+input_file_list = [clean_directory_name(item) for item in input_file_list]
+
+# Prepare output specific variables
+output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
+output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list]
+
+# --- INFO MESSAGES --- #
+print ("\n\n----------------------------")
+print ("Mapping openaire ids to DOIs")
+print ("Reading input from: " + synonyms_folder)
+print ("Num partitions: " + str(num_partitions))
+print ("Input files: " + " -- ".join(input_file_list))
+print ("Output files: " + " -- ".join(output_file_list))
+print ("----------------------------\n\n")
+#######################################################################################
+# We will define the following schemas:
+# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non-printable character)
+# --> a schema for floating point ranking scores [string - float - string] (the latter string is the class)
+# --> a schema for integer ranking scores [string - int - string] (the latter string is the class)
+
+float_schema = StructType([
+    StructField('id', StringType(), False),
+    StructField('score', FloatType(), False),
+    StructField('class', StringType(), False)
+    ])
+
+int_schema = StructType([
+    StructField('id', StringType(), False),
+    StructField('score', IntegerType(), False),
+    StructField('class', StringType(), False)
+    ])
+
+# This schema concerns the openaire id - doi synonym file
+# (one row per openaire id, holding the number of dois and the doi list)
+synonyms_schema = StructType([
+    StructField('id', StringType(), False),
+    StructField('num_synonyms', IntegerType(), False),
+    StructField('doi_list', StringType(), False),
+    ])
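To make the expected inputs concrete, here is a small sketch (illustration only, not part of the workflow: the openaire id, DOIs and score below are invented) that builds one row shaped like the synonym file and one row shaped like a floating-point ranking file, using the `synonyms_schema` and `float_schema` defined above, then splits the 0x02-separated DOI list and joins the two. This is the same transformation the main program below applies to the real HDFS folders.

```
# Illustration only: toy data following the schemas defined above
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('Synonym join sketch').getOrCreate()

# openaire id, number of DOIs, DOI list separated by the non-printable 0x02 character
toy_synonyms = spark.createDataFrame(
    [('openaire::0001', 2, '10.1234/a' + chr(0x02) + '10.1234/b')], schema=synonyms_schema)

# openaire id, ranking score, ranking class
toy_scores = spark.createDataFrame(
    [('openaire::0001', 0.85, 'C2')], schema=float_schema)

# split the DOI list, explode to one row per DOI, then join the scores on the openaire id
toy_synonyms = toy_synonyms.select('id', F.explode(F.split(F.col('doi_list'), chr(0x02))).alias('doi'))
toy_synonyms.join(toy_scores, ['id']).select('doi', 'score', 'class').show()
# expected output: one row per DOI, each carrying the same score/class,
# i.e. (10.1234/a, 0.85, C2) and (10.1234/b, 0.85, C2)
```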
+####################################################################################### +# Start spark session +spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate() +# Set Log Level for spark session +spark.sparkContext.setLogLevel('WARN') +####################################################################################### +# MAIN Program + +# Read and repartition the synonym folder - also cache it since we will need to perform multiple joins +synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder) +synonym_df = synonym_df.select('id', F.split(F.col('doi_list'), chr(0x02)).alias('doi_list')) +synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache() + +# TESTING +# print ("Synonyms: " + str(synonym_df.count())) +# print ("DF looks like this:" ) +# synonym_df.show(1000, False) + +print ("\n\n-----------------------------") +# Now we need to join the score files on the openaire-id with the synonyms and then keep +# only doi - score - class and write this to the output +for offset, input_file in enumerate(input_file_list): + + print ("Mapping scores from " + input_file) + + # Select correct schema + schema = int_schema + if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower(): + schema = float_schema + + # Load file to dataframe + ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id') + + # TESTING + # print ("Loaded df sample:") + # ranking_df.show(1000, False) + + # Join scores to synonyms and keep required fields + doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi').cache() + # Write output + output_file = output_file_list[offset] + print ("Writing to: " + output_file) + doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') + # Free memory? 
+ ranking_df.unpersist(True) + +print ("-----------------------------") +print ("\n\nFinished!\n\n") + + + + + + + + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml new file mode 100644 index 000000000..807c32063 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml @@ -0,0 +1,600 @@ + + + + + + + + + + + ${resume eq "rankings-start"} + ${resume eq "impulse"} + ${resume eq "rankings-iterative"} + ${resume eq "format-results"} + ${resume eq "map-ids"} + ${resume eq "map-scores"} + ${resume eq "start"} + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + + + + + yarn-cluster + cluster + + + Openaire Ranking Graph Creation + + create_openaire_ranking_graph.py + + --executor-memory 20G --executor-cores 4 --driver-memory 20G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + + ${openaireDataInput} + + ${currentYear} + + 7680 + + ${openaireGraphInputPath} + + ${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Spark CC + + CC.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + + 7680 + + ${wfAppPath}/CC.py#CC.py + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Spark RAM + + TAR.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + ${ramGamma} + ${currentYear} + RAM + + 7680 + ${γιτ α} + + ${wfAppPath}/TAR.py#TAR.py + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Spark Impulse + + CC.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + + 7680 + 3 + + ${wfAppPath}/CC.py#CC.py + + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + + + + + + + + yarn-cluster + cluster + + + Spark Pagerank + + PageRank.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf 
spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + ${pageRankAlpha} + ${convergenceError} + ${checkpointDir} + + 7680 + dfs + + ${wfAppPath}/PageRank.py#PageRank.py + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + yarn-cluster + cluster + + + Spark AttRank + + AttRank.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + ${attrankAlpha} + ${attrankBeta} + ${attrankGamma} + ${attrankRho} + ${currentYear} + ${attrankStartYear} + ${convergenceError} + ${checkpointDir} + + 7680 + dfs + + ${wfAppPath}/AttRank.py#AttRank.py + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + /usr/bin/bash + + get_ranking_files.sh + + /${workflowDataDir} + + + ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh + + + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + yarn-cluster + cluster + + + Format Ranking Results JSON + + format_ranking_results.py + + --executor-memory 10G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + json + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + + 7680 + + openaire + + ${wfAppPath}/format_ranking_results.py#format_ranking_results.py + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + yarn-cluster + cluster + + + Format Ranking Results BiP! 
DB + + format_ranking_results.py + + --executor-memory 10G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + zenodo + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + + 7680 + + openaire + + ${wfAppPath}/format_ranking_results.py#format_ranking_results.py + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + + + + + yarn-cluster + cluster + + + Openaire-DOI synonym collection + + map_openaire_ids_to_dois.py + + --executor-memory 18G --executor-cores 4 --driver-memory 15G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireDataInput} + + ${synonymFolder} + + ${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Mapping Openaire Scores to DOIs + + map_scores_to_dois.py + + --executor-memory 18G --executor-cores 4 --driver-memory 15G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${synonymFolder} + + 7680 + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + + + ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py + + + + + + + + + + + + + + + + + + PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + From b5c252865c15605e8b6ff154891d82a7544763d8 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Mon, 20 Mar 2023 
15:38:36 +0200 Subject: [PATCH 03/41] Add filtering based on citation source --- .../main/resources/create_openaire_ranking_graph.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py index 4cffa86a3..cda12a77c 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py @@ -126,12 +126,19 @@ oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').disti # Collect only valid citations i.e., invisible = false & deletedbyinference=false cites_df = spark.read.json(graph_folder + "/relation")\ - .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ + .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'collectedfrom.value', 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ .where( (F.col('relClass') == "Cites") \ & (F.col('dataInfo.deletedbyinference') == "false")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ - .repartition(num_partitions, 'citing').drop('relClass') + .repartition(num_partitions, 'citing').drop('relClass')\ + .withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ + .drop('collectedfrom.value')\ + .where( + (F.array_contains(F.col('collected_lower'), "opencitations")) + | (F.array_contains(F.col('collected_lower'), "crossref")) + | (F.array_contains(F.col('collected_lower'), "mag")) + ).drop('collected_lower') # print ("Cited df has: " + str(cites_df.count()) + " entries") # DEPRECATED From 9dc8f0f05f2d527bccbde92680f864dbb635710f Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 21 Mar 2023 16:14:15 +0200 Subject: [PATCH 04/41] Add ActionSet step --- .../src/main/resources/job.properties | 6 +++ .../src/main/resources/workflow.xml | 51 ++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties index 9ad9def21..a902c413f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties @@ -63,6 +63,9 @@ oozieWorkflowPath=user/ilias.kanellos/workflow_example/ # The directory where the workflow data is/should be stored workflowDataDir=user/ilias.kanellos/ranking_workflow +# Directory where json data containing scores will be output +bipScorePath=${workflowDataDir}/openaire_universe_scores/ + # Directory where dataframes are checkpointed checkpointDir=${nameNode}/${workflowDataDir}/check/ @@ -84,3 +87,6 @@ wfAppPath=${nameNode}/${oozieWorkflowPath} # The following is needed as a property of a workflow oozie.wf.application.path=${wfAppPath} +# Path where the final output should be? 
+actionSetOutputPath=${workflowDataDir}/bip_actionsets/ + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml index 807c32063..d99dc16a2 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml @@ -552,11 +552,50 @@ - + - + + + + + + + + + + + + + + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${actionSetOutputPath} + + + + + @@ -597,4 +636,12 @@ Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + From f992ecb6573b507351773096af78d65faef1baac Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 21 Mar 2023 18:03:55 +0200 Subject: [PATCH 05/41] Checkout BIP-Ranker during 'prepare-package' && add it in the oozie-package.tar.gz --- dhp-workflows/dhp-impact-indicators/README.md | 14 ++++++-------- dhp-workflows/dhp-impact-indicators/pom.xml | 19 +++++++++++++++---- .../create_openaire_ranking_graph.py | 0 .../format_ranking_results.py | 0 .../{ => eu.dnetlib}/get_ranking_files.sh | 0 .../resources/{ => eu.dnetlib}/job.properties | 0 .../map_openaire_ids_to_dois.py | 0 .../{ => eu.dnetlib}/map_scores_to_dois.py | 0 .../resources/{ => eu.dnetlib}/workflow.xml | 0 dhp-workflows/pom.xml | 1 + 10 files changed, 22 insertions(+), 12 deletions(-) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/create_openaire_ranking_graph.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/format_ranking_results.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/get_ranking_files.sh (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/job.properties (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/map_openaire_ids_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/map_scores_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/workflow.xml (100%) diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md index 14f489da3..45a4701e7 100644 --- a/dhp-workflows/dhp-impact-indicators/README.md +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -1,4 +1,4 @@ -# Ranking Workflow for Openaire Publications +# Ranking Workflow for OpenAIRE Publications This project contains the files for running a paper ranking workflow on the openaire graph using apache oozie. 
All scripts are written in python and the project setup follows the typical oozie workflow structure: @@ -7,17 +7,15 @@ All scripts are written in python and the project setup follows the typical oozi - a job.properties file specifying parameter values for the parameters used by the workflow - a set of python scripts used by the workflow -**NOTE**: the workflow depends on the external library of ranking scripts called BiP! Ranker. +**NOTE**: the workflow depends on the external library of ranking scripts called [BiP! Ranker](https://github.com/athenarc/Bip-Ranker). You can check out a specific tag/release of BIP! Ranker using maven, as described in the following section. -## Check out a specific tag/release of BIP-Ranker +## Build and deploy -* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. - -* Then, use maven to perform the checkout: +Use the following command for packaging: ``` -mvn scm:checkout +mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests ``` -* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file +Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use. diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml index b510635a6..644b82c7b 100644 --- a/dhp-workflows/dhp-impact-indicators/pom.xml +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -5,9 +5,8 @@ 4.0.0 eu.dnetlib.dhp - dhp + dhp-workflows 1.2.5-SNAPSHOT - ../pom.xml dhp-impact-indicators @@ -16,6 +15,9 @@ 8 8 UTF-8 + + + v1.0.0 @@ -32,9 +34,18 @@ connection tag - v1.0.0 - ${project.build.directory}/../src/main/bip-ranker + ${bip.ranker.tag} + ${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/bip-ranker + + + checkout-bip-ranker + prepare-package + + checkout + + + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties rename to 
dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 541d59007..d054ba39b 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -38,6 +38,7 @@ dhp-usage-raw-data-update dhp-broker-events dhp-doiboost + dhp-impact-indicators From 3e8a4cf9521fdab068e47f48536e707d14f0ea18 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 21 Mar 2023 18:24:12 +0200 Subject: [PATCH 06/41] Rearrange resources folder structure --- .../create_openaire_ranking_graph.py | 11 +++- .../oozie_app}/format_ranking_results.py | 0 .../oozie_app}/get_ranking_files.sh | 0 .../oozie_app}/job.properties | 6 +++ .../oozie_app}/map_openaire_ids_to_dois.py | 0 .../oozie_app}/map_scores_to_dois.py | 0 .../impact_indicators/oozie_app}/workflow.xml | 51 ++++++++++++++++++- 7 files changed, 64 insertions(+), 4 deletions(-) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/create_openaire_ranking_graph.py (95%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/format_ranking_results.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/get_ranking_files.sh (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/job.properties (93%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/map_openaire_ids_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/map_scores_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/workflow.xml (93%) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py similarity index 95% rename from 
dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 4cffa86a3..cda12a77c 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -126,12 +126,19 @@ oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').disti # Collect only valid citations i.e., invisible = false & deletedbyinference=false cites_df = spark.read.json(graph_folder + "/relation")\ - .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ + .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'collectedfrom.value', 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ .where( (F.col('relClass') == "Cites") \ & (F.col('dataInfo.deletedbyinference') == "false")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ - .repartition(num_partitions, 'citing').drop('relClass') + .repartition(num_partitions, 'citing').drop('relClass')\ + .withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ + .drop('collectedfrom.value')\ + .where( + (F.array_contains(F.col('collected_lower'), "opencitations")) + | (F.array_contains(F.col('collected_lower'), "crossref")) + | (F.array_contains(F.col('collected_lower'), "mag")) + ).drop('collected_lower') # print ("Cited df has: " + str(cites_df.count()) + " entries") # DEPRECATED diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties similarity index 93% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index 9ad9def21..a902c413f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -63,6 +63,9 @@ 
oozieWorkflowPath=user/ilias.kanellos/workflow_example/ # The directory where the workflow data is/should be stored workflowDataDir=user/ilias.kanellos/ranking_workflow +# Directory where json data containing scores will be output +bipScorePath=${workflowDataDir}/openaire_universe_scores/ + # Directory where dataframes are checkpointed checkpointDir=${nameNode}/${workflowDataDir}/check/ @@ -84,3 +87,6 @@ wfAppPath=${nameNode}/${oozieWorkflowPath} # The following is needed as a property of a workflow oozie.wf.application.path=${wfAppPath} +# Path where the final output should be? +actionSetOutputPath=${workflowDataDir}/bip_actionsets/ + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml similarity index 93% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 807c32063..d99dc16a2 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -552,11 +552,50 @@ - + - + + + + + + + + + + + + + + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${actionSetOutputPath} + + + + + @@ -597,4 +636,12 @@ Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + From 102aa5ab81bf2acf6b758b0255d4383f050d31d6 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 21 
Mar 2023 19:25:29 +0200 Subject: [PATCH 07/41] Add dependency to dhp-aggregation --- dhp-workflows/dhp-impact-indicators/pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml index 644b82c7b..a9eb0a4a1 100644 --- a/dhp-workflows/dhp-impact-indicators/pom.xml +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -49,4 +49,14 @@ + + + + eu.dnetlib.dhp + dhp-aggregation + ${projectVersion} + compile + + + \ No newline at end of file From 7256c8d3c71c632ae0537e2c5ce585da738662b5 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 7 Apr 2023 16:30:12 +0300 Subject: [PATCH 08/41] Add script for aggregating impact indicators at the project level --- .../oozie_app/job.properties | 3 + .../oozie_app/projects_impact.py | 109 ++++++++++++++++++ .../impact_indicators/oozie_app/workflow.xml | 70 ++++++++++- 3 files changed, 176 insertions(+), 6 deletions(-) create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index a902c413f..f9f5519cc 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -90,3 +90,6 @@ oozie.wf.application.path=${wfAppPath} # Path where the final output should be? actionSetOutputPath=${workflowDataDir}/bip_actionsets/ +# The directory to store project impact indicators +projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py new file mode 100644 index 000000000..f01c92a0d --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py @@ -0,0 +1,109 @@ +import sys +from pyspark.sql import SparkSession +from pyspark import SparkConf, SparkContext +import pyspark.sql.functions as F +from pyspark.sql.types import StringType, IntegerType, StructType, StructField + +if len(sys.argv) < 8: + print("Usage: projects_impact.py ") + sys.exit(-1) + +appName = 'Project Impact Indicators' +conf = SparkConf().setAppName(appName) +sc = SparkContext(conf = conf) +spark = SparkSession.builder.appName(appName).getOrCreate() +sc.setLogLevel('OFF') + +# input parameters +relations_fd = sys.argv[1] +influence_fd = sys.argv[2] +popularity_fd = sys.argv[3] +cc_fd = sys.argv[4] +impulse_fd = sys.argv[5] +num_partitions = int(sys.argv[6]) +output_dir = sys.argv[7] + +# schema for impact indicator files +impact_files_schema = StructType([ + StructField('resultId', StringType(), False), + StructField('score', IntegerType(), False), + StructField('class', StringType(), False), +]) + +# list of impact indicators +impact_indicators = [ + ('influence', influence_fd, 'class'), + ('popularity', popularity_fd, 'class'), + ('impulse', impulse_fd, 'score'), + ('citation_count', cc_fd, 'score') +] + 
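Before the helper functions that follow, a toy example may help clarify what this aggregation produces. In the sketch below (illustration only; the project/result ids and numbers are invented), the 'class'-based indicators (influence, popularity) are first mapped to 0/1 flags, 0 for class C5 and 1 otherwise, so their per-project sums become counts of influential/popular results, while the 'score'-based indicators (impulse, citation_count) are summed as-is.

```
# Standalone illustration of the project-level aggregation on hard-coded rows
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('Project impact sketch').getOrCreate()

# project-result pairs already joined with the per-result indicator values
toy = spark.createDataFrame(
    [('proj::1', 'res::a', 'C2', 'C5', 3, 12),
     ('proj::1', 'res::b', 'C5', 'C3', 0, 1)],
    ['projectId', 'resultId', 'influence', 'popularity', 'impulse', 'citation_count'])

# class columns -> 0/1 flags, as done in the loop further down
for class_col in ['influence', 'popularity']:
    toy = toy.withColumn(class_col, F.when(F.col(class_col).isin('C5'), 0).otherwise(1))

toy.groupBy('projectId').agg(
    F.sum('influence').alias('influence'),            # 1 influential result (res::a)
    F.sum('popularity').alias('popularity'),          # 1 popular result (res::b)
    F.sum('impulse').alias('impulse'),                # total impulse: 3
    F.sum('citation_count').alias('citation_count')   # total citation count: 13
).show()
```

These four per-project sums are what the actionset step added later in this patch series (see BipProjectModel further down) exposes as numOfInfluentialResults, numOfPopularResults, totalImpulse and totalCitationCount.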
+'''
+ * Read impact indicator file and return a dataframe with the following schema:
+ * resultId: String
+ * indicator_name: Integer
+'''
+def read_df(fd, indicator_name, column_name):
+    return spark.read.schema(impact_files_schema)\
+        .option('delimiter', '\t')\
+        .option('header', False)\
+        .csv(fd)\
+        .select('resultId', F.col(column_name).alias(indicator_name))\
+        .repartition(num_partitions, 'resultId')
+
+# Print dataframe schema, first 5 rows, and count
+def print_df(df):
+    df.show(50)
+    df.printSchema()
+    print(df.count())
+
+# Sets a null value to the column if the value is equal to the given value
+def set_class_value_to_null(column, value):
+    return F.when(column != value, column).otherwise(F.lit(None))
+
+# load and filter Project-to-Result relations
+print("Reading relations")
+relations = spark.read.json(relations_fd)\
+    .select(F.col('source').alias('projectId'), F.col('target').alias('resultId'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\
+    .where( (F.col('relClass') == 'produces') \
+        & (F.col('deletedbyinference') == "false")\
+        & (F.col('invisible') == "false"))\
+    .drop('deletedbyinference')\
+    .drop('invisible')\
+    .drop('relClass')\
+    .repartition(num_partitions, 'resultId')
+
+for indicator_name, fd, column_name in impact_indicators:
+
+    print("Reading {} '{}' field from file".format(indicator_name, column_name))
+    df = read_df(fd, indicator_name, column_name)
+
+    # sets a zero value to the indicator column if the value is C5
+    if (column_name == 'class'):
+        df = df.withColumn(indicator_name, F.when(F.col(indicator_name).isin("C5"), 0).otherwise(1))
+
+    # print_df(df)
+
+    print("Joining {} to relations".format(indicator_name))
+
+    # NOTE: we use inner join because we want to keep only the results that have an impact score
+    # also note that all impact scores have the same set of results
+    relations = relations.join(df, 'resultId', 'inner')\
+        .repartition(num_partitions, 'resultId')
+
+# uncomment to print non-null values count for each indicator
+# for indicator_name, fd, column_name in impact_indicators:
+# print("Counting non null values for {}".format(indicator_name))
+# print(relations.filter(F.col(indicator_name).isNotNull()).count())
+
+# sum the impact indicator values for each project
+relations.groupBy('projectId')\
+    .agg(\
+        F.sum('influence').alias('influence'),\
+        F.sum('popularity').alias('popularity'),\
+        F.sum('impulse').alias('impulse'),\
+        F.sum('citation_count').alias('citation_count')\
+    )\
+    .write.mode("overwrite")\
+    .option("delimiter", "\t")\
+    .csv(output_dir, compression="gzip")
\ No newline at end of file
diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
index d99dc16a2..8cd0b0d5d 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@@ -15,6 +15,8 @@ ${resume eq "map-ids"} ${resume eq "map-scores"} ${resume eq "start"} + ${resume eq "projects-impact"} + + @@ -334,7 +336,7 @@ ${nameNode} - + /usr/bin/bash get_ranking_files.sh @@ -558,7 +560,7 @@ - + @@ -592,11 +594,63 @@ --inputPath${bipScorePath} --outputPath${actionSetOutputPath} - + - - + + + + + + + ${jobTracker} + + ${nameNode} + + yarn-cluster + cluster + +
Spark Pagerank + + PageRank.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + + + + ${openaireDataInput}/relations + + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + + + 7680 + + ${projectImpactIndicatorsOutput} + + + ${wfAppPath}/projects_impact.py#projects_impact.py + + + + + + + + + @@ -642,6 +696,10 @@ ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + From 23f58a86f177ac7fcbef5b3d5bff28e654299f07 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 18 Apr 2023 12:26:01 +0300 Subject: [PATCH 09/41] Change jar param in project impact indicators action --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 8cd0b0d5d..ac44d5c05 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -611,9 +611,9 @@ cluster - Spark Pagerank + Project Impact Indicators - PageRank.py + projects_impact.py --executor-memory 18G --executor-cores 4 --driver-memory 10G --master yarn From ee04cf92bf4030f9be3b4a34703198c3dd5ce424 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 26 Apr 2023 20:23:46 +0300 Subject: [PATCH 10/41] Add actionsets for project impact indicators --- .../bipfinder/SparkAtomicActionScoreJob.java | 63 +++++++---- .../score/deserializers/BipProjectModel.java | 69 ++++++++++++ .../deserializers/BipResultModel.java} | 8 +- .../PrepareBipFinder.java | 6 +- .../bipfinder/input_actionset_parameter.json | 6 ++ .../SparkAtomicActionScoreJobTest.java | 102 ++++++++++++++---- .../bipfinder/project_bip_scores.json | 4 + ...scores_oid.json => result_bip_scores.json} | 0 .../oozie_app/projects_impact.py | 13 ++- 9 files changed, 218 insertions(+), 53 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/{BipDeserialize.java => score/deserializers/BipResultModel.java} (65%) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/{bip_scores_oid.json => result_bip_scores.json} (100%) diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index ddf5f4adf..13ce1440a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -24,7 +25,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -56,18 +57,17 @@ public class SparkAtomicActionScoreJob implements Serializable { parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String inputPath = parser.get("inputPath"); - log.info("inputPath {}: ", inputPath); + log.info("inputPath: {}", inputPath); final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + log.info("outputPath: {}", outputPath); + + final String targetEntity = parser.get("targetEntity"); + log.info("targetEntity: {}", targetEntity); SparkConf conf = new SparkConf(); @@ -76,17 +76,48 @@ public class SparkAtomicActionScoreJob implements Serializable { isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareResults(spark, inputPath, outputPath); - }); + + // follow different procedures for different target entities + switch (targetEntity) { + case "result": + prepareResults(spark, inputPath, outputPath); + break; + case "project": + prepareProjects(spark, inputPath, outputPath); + break; + default: + throw new RuntimeException("Unknown target entity: " + targetEntity); + } + } + ); + } + + private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) { + + // read input bip project scores + Dataset projectScores = readPath(spark, inputPath, BipProjectModel.class); + + projectScores.map( (MapFunction) bipProjectScores -> { + Project project = new Project(); + project.setId(bipProjectScores.getProjectId()); + project.setMeasures(bipProjectScores.toMeasures()); + return project; + }, Encoders.bean(Project.class)) + .toJavaRDD() + .map(p -> new AtomicAction(Project.class, p)) + .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + } private static void prepareResults(SparkSession spark, String bipScorePath, String outputPath) { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD bipDeserializeJavaRDD = sc + JavaRDD 
bipDeserializeJavaRDD = sc .textFile(bipScorePath) - .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + .map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class)); Dataset bipScores = spark .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { @@ -159,12 +190,4 @@ public class SparkAtomicActionScoreJob implements Serializable { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - public static Dataset readPath( - SparkSession spark, String inputPath, Class clazz) { - return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); - } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java new file mode 100644 index 000000000..77c1567a8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java @@ -0,0 +1,69 @@ +package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; + +import com.opencsv.bean.CsvBindByPosition; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import eu.dnetlib.dhp.schema.oaf.Measure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; + +@NoArgsConstructor +@AllArgsConstructor +@Getter +@Setter +public class BipProjectModel { + String projectId; + + String numOfInfluentialResults; + + String numOfPopularResults; + + String totalImpulse; + + String totalCitationCount; + + // each project bip measure has exactly one value, hence one key-value pair + private Measure createMeasure(String measureId, String measureValue) { + + KeyValue kv = new KeyValue(); + kv.setKey("score"); + kv.setValue(measureValue); + kv.setDataInfo( + OafMapperUtils.dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils.qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "") + ); + + Measure measure = new Measure(); + measure.setId(measureId); + measure.setUnit(Collections.singletonList(kv)); + return measure; + } + public List toMeasures() { + return Arrays.asList( + createMeasure("numOfInfluentialResults", numOfInfluentialResults), + createMeasure("numOfPopularResults", numOfPopularResults), + createMeasure("totalImpulse", totalImpulse), + createMeasure("totalCitationCount", totalCitationCount) + ); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java similarity index 65% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java index a70bca618..06a173413 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java @@ -1,5 +1,7 @@ -package eu.dnetlib.dhp.actionmanager.bipmodel; +package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; + +import eu.dnetlib.dhp.actionmanager.bipmodel.Score; import java.io.Serializable; import java.util.ArrayList; @@ -11,9 +13,9 @@ import java.util.List; * Only needed for deserialization purposes */ -public class BipDeserialize extends HashMap> implements Serializable { +public class BipResultModel extends HashMap> implements Serializable { - public BipDeserialize() { + public BipResultModel() { super(); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index 80573c71a..efcb96a85 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -24,7 +24,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -82,9 +82,9 @@ public class PrepareBipFinder implements Serializable { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD bipDeserializeJavaRDD = sc + JavaRDD bipDeserializeJavaRDD = sc .textFile(inputPath) - .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + .map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class)); spark .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json index 7663a454b..d6b93c5af 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -16,5 +16,11 @@ "paramLongName": "outputPath", "paramDescription": "the path of the new ActionSet", "paramRequired": true + }, + { + "paramName": "te", + "paramLongName": "targetEntity", + "paramDescription": "the type of target entity to be enriched; currently supported one of { 'result', 'project' }", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index be82b9fc3..aa5a19f11 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -6,8 +6,9 @@ import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Project; import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -27,7 +28,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Result; public class SparkAtomicActionScoreJobTest { @@ -37,8 +37,11 @@ public class SparkAtomicActionScoreJobTest { private static SparkSession spark; private static Path workingDir; - private static final Logger log = LoggerFactory - .getLogger(SparkAtomicActionScoreJobTest.class); + + private final static String RESULT = "result"; + private final static String PROJECT = "project"; + + private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class); @BeforeAll public static void beforeAll() throws IOException { @@ -69,29 +72,31 @@ public class SparkAtomicActionScoreJobTest { spark.stop(); } + private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception { + SparkAtomicActionScoreJob.main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-inputPath", inputPath, + "-outputPath", outputPath, + "-targetEntity", targetEntity, + } + ); + } @Test - void testMatch() throws Exception { - String bipScoresPath = getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json") + void testResultScores() throws Exception { + final String targetEntity = RESULT; + String inputResultScores = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json") .getPath(); + String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; - SparkAtomicActionScoreJob - .main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputPath", - - bipScoresPath, - - "-outputPath", - workingDir.toString() + "/actionSet" - }); + // execute the job to generate the action sets for result scores + runJob(inputResultScores, outputPath, targetEntity); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(outputPath, Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Result) aa.getPayload())); @@ -140,4 +145,61 @@ public class SparkAtomicActionScoreJobTest { } + @Test + void testProjectScores() throws Exception { + String targetEntity = PROJECT; + String inputResultScores = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") + .getPath(); + String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; + + // execute the job to generate the action sets for project scores + runJob(inputResultScores, outputPath, PROJECT); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD projects = sc + .sequenceFile(outputPath, Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Project) 
aa.getPayload())); + + // test the number of projects + assertEquals(4, projects.count()); + + String testProjectId = "40|nih_________::c02a8233e9b60f05bb418f0c9b714833"; + + // count that the project with id testProjectId is present + assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count()); + + projects.filter(row -> row.getId().equals(testProjectId)) + .flatMap(r -> r.getMeasures().iterator()) + .foreach(m -> { + log.info(m.getId() + " " + m.getUnit()); + + // ensure that only one score is present for each bip impact measure + assertEquals(1, m.getUnit().size()); + + KeyValue kv = m.getUnit().get(0); + + // ensure that the correct key is provided, i.e. score + assertEquals("score", kv.getKey()); + + switch(m.getId()) { + case "numOfInfluentialResults": + assertEquals("0", kv.getValue()); + break; + case "numOfPopularResults": + assertEquals("1", kv.getValue()); + break; + case "totalImpulse": + assertEquals("25", kv.getValue()); + break; + case "totalCitationCount": + assertEquals("43", kv.getValue()); + break; + default: + fail("Unknown measure id in the context of projects"); + } + }); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json new file mode 100644 index 000000000..096268287 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json @@ -0,0 +1,4 @@ +{"projectId":"40|nsf_________::d93e50d22374a1cf59f6a232413ea027","numOfInfluentialResults":0,"numOfPopularResults":10,"totalImpulse":181,"totalCitationCount":235} +{"projectId":"40|nih_________::1c93debc7085e440f245fbe70b2e8b21","numOfInfluentialResults":14,"numOfPopularResults":17,"totalImpulse":1558,"totalCitationCount":4226} +{"projectId":"40|nih_________::c02a8233e9b60f05bb418f0c9b714833","numOfInfluentialResults":0,"numOfPopularResults":1,"totalImpulse":25,"totalCitationCount":43} +{"projectId":"40|corda_______::d91dcf3a87dd7f72248fab0b8a4ba273","numOfInfluentialResults":2,"numOfPopularResults":3,"totalImpulse":78,"totalCitationCount":178} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py index f01c92a0d..d60f86e88 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py @@ -96,14 +96,13 @@ for indicator_name, fd, column_name in impact_indicators: # print("Counting non null values for {}".format(indicator_name)) # print(relations.filter(F.col(indicator_name).isNotNull()).count()) -sum the impact 
indicator values for each project +# sum the impact indicator values for each project relations.groupBy('projectId')\ .agg(\ - F.sum('influence').alias('influence'),\ - F.sum('popularity').alias('popularity'),\ - F.sum('impulse').alias('impulse'),\ - F.sum('citation_count').alias('citation_count')\ + F.sum('influence').alias('numOfInfluentialResults'),\ + F.sum('popularity').alias('numOfPopularResults'),\ + F.sum('impulse').alias('totalImpulse'),\ + F.sum('citation_count').alias('totalCitationCount')\ )\ .write.mode("overwrite")\ - .option("delimiter", "\t")\ - .csv(output_dir, compression="gzip") \ No newline at end of file + .json(output_dir, compression="gzip") \ No newline at end of file From 815a4ddbbaf6fa68a23d576189db2ee03f97f828 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 26 Apr 2023 20:40:06 +0300 Subject: [PATCH 11/41] Add actionset creation for project bip indicators in workflow --- .../bipfinder/SparkAtomicActionScoreJob.java | 7 +- .../impact_indicators/oozie_app/workflow.xml | 81 +++++++++++-------- 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index 13ce1440a..8b8e05723 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -41,7 +41,8 @@ import scala.Tuple2; */ public class SparkAtomicActionScoreJob implements Serializable { - private static final String DOI = "doi"; + private static final String RESULT = "result"; + private static final String PROJECT = "project"; private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -79,10 +80,10 @@ public class SparkAtomicActionScoreJob implements Serializable { // follow different procedures for different target entities switch (targetEntity) { - case "result": + case RESULT: prepareResults(spark, inputPath, outputPath); break; - case "project": + case PROJECT: prepareProjects(spark, inputPath, outputPath); break; default: diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index ac44d5c05..c77443bd9 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -34,7 +34,6 @@ - yarn-cluster @@ -90,9 +89,8 @@ ${nameNode} - - - yarn-cluster + + yarn-cluster cluster @@ -131,7 +129,6 @@ ${jobTracker} ${nameNode} - yarn-cluster @@ -181,9 +178,8 @@ ${nameNode} - - - yarn-cluster + + yarn-cluster cluster @@ -235,7 +231,7 @@ - + yarn-cluster @@ -336,12 +332,12 @@ ${nameNode} - - /usr/bin/bash - - get_ranking_files.sh - - /${workflowDataDir} + + /usr/bin/bash + + get_ranking_files.sh + + /${workflowDataDir} ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh @@ -374,8 +370,8 @@ ${nameNode} - - yarn-cluster + + yarn-cluster cluster @@ -422,8 +418,8 @@ ${nameNode} - - yarn-cluster + + 
yarn-cluster cluster @@ -476,7 +472,6 @@ - yarn-cluster @@ -520,7 +515,6 @@ ${nameNode} - yarn-cluster cluster @@ -564,17 +558,12 @@ - - + - - + yarn cluster @@ -593,12 +582,12 @@ --inputPath${bipScorePath} --outputPath${actionSetOutputPath} - + --targetEntityresult + - @@ -645,13 +634,38 @@ - + + + + yarn + cluster + Produces the atomic action with the bip finder scores for projects + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${projectImpactIndicatorsOutput} + --outputPath${actionSetOutputPath} + --targetEntityproject + + + + + @@ -695,11 +709,14 @@ - ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}] Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + From 614cc1089b975f8dc05df4f671029b5bdaa31d44 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Thu, 27 Apr 2023 12:37:15 +0300 Subject: [PATCH 12/41] Add separate forder for results && project actionsets --- .../graph/impact_indicators/oozie_app/workflow.xml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index c77443bd9..5f67bb914 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -556,9 +556,12 @@ - - - + + + + + + @@ -581,7 +584,7 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --inputPath${bipScorePath} - --outputPath${actionSetOutputPath} + --outputPath${actionSetOutputPath}/results/ --targetEntityresult @@ -659,7 +662,7 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --inputPath${projectImpactIndicatorsOutput} - --outputPath${actionSetOutputPath} + --outputPath${actionSetOutputPath}/projects/ --targetEntityproject From 09485fbee3d1c782af33756a73b59f53a90532b5 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 28 Apr 2023 13:09:13 +0300 Subject: [PATCH 13/41] Fixed unicode bug. 
Workflow ends after first script --- .../dhp-impact-indicators/src/main/resources/workflow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml index d99dc16a2..a957f6c10 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml @@ -65,7 +65,7 @@ - + @@ -155,7 +155,7 @@ RAM 7680 - ${γιτ α} + ${checkpointDir} ${wfAppPath}/TAR.py#TAR.py From 90332439adc5e1400067fc61cefecbc39c9ab478 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 28 Apr 2023 13:45:19 +0300 Subject: [PATCH 14/41] Remove deletion of synonym folder --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 815096665..7aa95db22 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -31,9 +31,11 @@ ${nameNode} + yarn-cluster From 3de35fd6a310ca41c8fb7cdd1a1e1396a2067fba Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 11 May 2023 14:42:25 +0300 Subject: [PATCH 15/41] Produce 5 classes of ranking scores --- .../oozie_app/format_ranking_results.py | 31 ++++++++++++++++--- .../impact_indicators/oozie_app/workflow.xml | 2 +- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py index 60c71e52f..e7d62c2f1 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py @@ -421,7 +421,7 @@ elif mode == 'json': # Score-specific dataframe - read inputs pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') - attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id') cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') @@ -601,7 +601,7 @@ elif mode == 'json-5-way': # Score-specific dataframe - read inputs pagerank_df = spark.read.schema(float_schema).option('delimiter', 
'\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') - attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id') cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') @@ -753,15 +753,36 @@ elif mode == 'json-5-way': # -------------------------------------------- # # Write json output + # -------------------------------------------- # + # Write json output - set the directory here output_dir = "/".join(pagerank_dir.split('/')[:-1]) if graph_type == 'bip': - output_dir = output_dir + '/bip_universe_doi_scores_5_classes/' + output_dir = output_dir + '/bip_universe_doi_scores/' else: - output_dir = output_dir + '/openaire_universe_scores_5_classes/' - + output_dir = output_dir + '/openaire_universe_scores/' + + # Write the dataframe print ("Writing output to: " + output_dir) results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + # Rename the files to .json.gz now + sc = spark.sparkContext + URI = sc._gateway.jvm.java.net.URI + Path = sc._gateway.jvm.org.apache.hadoop.fs.Path + FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem + # Get master prefix from input file path + master_prefix = "/".join(pagerank_dir.split('/')[:5]) + fs = FileSystem.get(URI(master_prefix), sc._jsc.hadoopConfiguration()) + path = Path(output_dir) + print ("Path is:" + path.toString()) + file_list = fs.listStatus(Path(output_dir)) + print ("Renaming files:") + for f in file_list: + initial_filename = f.getPath().toString() + if "part" in initial_filename: + print (initial_filename + " => " + initial_filename.replace(".txt.gz", ".json.gz")) + fs.rename(Path(initial_filename), Path(initial_filename.replace(".txt.gz", ".json.gz"))) + # Close spark session spark.stop() diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 7aa95db22..f07a27244 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -390,7 +390,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - json + json-5-way ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} From 5ddbb4ad10f8885e6fdbc9c18e0356c2a25db63a Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 11 May 2023 15:36:47 +0300 Subject: [PATCH 16/41] Spark properties no longer hardcoded --- .../oa/graph/impact_indicators/oozie_app/job.properties | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index f9f5519cc..7b4bb96cf 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -13,6 +13,14 @@ maven.executable=mvn sparkDriverMemory=7G sparkExecutorMemory=7G sparkExecutorCores=4 + +# Some memory and driver settings for more demanding tasks +sparkHighDriverMemory=20G +sparkNormalDriverMemory=10G + +sparkHighExecutorMemory=20G +sparkNormalExecutorMemory=10G + # The above is given differently in an example I found online oozie.action.sharelib.for.spark=spark2 oozieActionShareLibForSpark2=spark2 From 1788ac2d4d1403dc8ed4173e9487c9f1a8d1ba4c Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 12 May 2023 12:55:43 +0300 Subject: [PATCH 17/41] Correct filtering for MAG records --- .../oozie_app/create_openaire_ranking_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index cda12a77c..182fd9309 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -137,7 +137,7 @@ cites_df = spark.read.json(graph_folder + "/relation")\ .where( (F.array_contains(F.col('collected_lower'), "opencitations")) | (F.array_contains(F.col('collected_lower'), "crossref")) - | (F.array_contains(F.col('collected_lower'), "mag")) + | (F.array_contains(F.col('collected_lower'), "microsoft academic graph")) ).drop('collected_lower') # print ("Cited df has: " + str(cites_df.count()) + " entries") From 07818131ef0067810953c8692d6559c56d25bb48 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 13:04:44 +0300 Subject: [PATCH 18/41] Update documentation --- dhp-workflows/dhp-impact-indicators/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md index 45a4701e7..aad94ea19 100644 --- a/dhp-workflows/dhp-impact-indicators/README.md +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -15,7 +15,12 @@ You can check out a specific tag/release of BIP! Ranker using maven, as describe Use the following command for packaging: ``` -mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests +mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests +``` + +Deploy and run: +``` +mvn package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests ``` Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use. 
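
As an aside on the citation-source filter corrected in PATCH 17 above: `F.array_contains` performs exact element matching, so the lowercase literals must equal the full datasource names (presumably why the shorter `"mag"` literal matched nothing). The sketch below is a minimal, self-contained illustration of that behaviour; the toy DataFrame, the `transform`-based lowercasing step (assumes Spark >= 2.4) and the column names are illustrative assumptions, not the actual logic of `create_openaire_ranking_graph.py`.

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").appName("citation-source-filter-sketch").getOrCreate()

# Toy citation relations; 'collectedfrom' holds datasource names as free-text strings.
rels = spark.createDataFrame(
    [
        ("50|a", "50|b", ["Crossref"]),
        ("50|c", "50|d", ["Microsoft Academic Graph"]),
        ("50|e", "50|f", ["Some Other Aggregator"]),
    ],
    ["citing", "cited", "collectedfrom"],
)

# Lowercase every datasource name (assumed step), then keep only the trusted citation providers.
rels = rels.withColumn("collected_lower", F.expr("transform(collectedfrom, x -> lower(x))"))

trusted = rels.where(
    F.array_contains(F.col("collected_lower"), "opencitations")
    | F.array_contains(F.col("collected_lower"), "crossref")
    | F.array_contains(F.col("collected_lower"), "microsoft academic graph")
).drop("collected_lower")

trusted.show(truncate=False)  # only the Crossref and Microsoft Academic Graph rows survive
```

Because the match is on whole array elements, abbreviations or substrings such as "mag" never hit; the filter literal has to be the exact lowercased datasource label.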
From 4a905932a3db36c61570c24b9aa54283cd30abba Mon Sep 17 00:00:00 2001 From: ikanellos Date: Mon, 15 May 2023 15:24:22 +0300 Subject: [PATCH 19/41] Spark properties from job.properties --- .../impact_indicators/oozie_app/workflow.xml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index f07a27244..ec2bb140f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -46,7 +46,7 @@ create_openaire_ranking_graph.py - --executor-memory 20G --executor-cores 4 --driver-memory 20G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -100,7 +100,7 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -141,7 +141,7 @@ TAR.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -189,7 +189,7 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -244,7 +244,7 @@ PageRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -289,7 +289,7 @@ AttRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -381,7 +381,7 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -429,7 +429,7 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -484,7 +484,7 @@ map_openaire_ids_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} --master yarn --deploy-mode 
cluster --conf spark.sql.shuffle.partitions=7680 @@ -526,7 +526,7 @@ map_scores_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -609,7 +609,7 @@ projects_impact.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 From b8e8c959fe5a72f3b88610643b5e229371aa687c Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 15:50:23 +0300 Subject: [PATCH 20/41] Update workflow.xml && job.properties --- .../oozie_app/job.properties | 46 ++-- .../impact_indicators/oozie_app/workflow.xml | 260 ++++++++++-------- 2 files changed, 163 insertions(+), 143 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index 7b4bb96cf..08f9b1eac 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -1,18 +1,16 @@ # The following set of properties are defined in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters # and concern the parameterization required for running workflows on the @GARR cluster -dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos -dhp.hadoop.frontend.user.name=ilias.kanellos -dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl -dhp.hadoop.frontend.port.ssh=22 -oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie -jobTracker=yarnRM -nameNode=hdfs://nameservice1 -oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log -maven.executable=mvn -sparkDriverMemory=7G -sparkExecutorMemory=7G -sparkExecutorCores=4 +# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties --- +# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos +# dhp.hadoop.frontend.user.name=ilias.kanellos +# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl +# dhp.hadoop.frontend.port.ssh=22 +# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie +# jobTracker=yarnRM +# nameNode=hdfs://nameservice1 +# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log +# maven.executable=mvn # Some memory and driver settings for more demanding tasks sparkHighDriverMemory=20G @@ -21,6 +19,9 @@ sparkNormalDriverMemory=10G sparkHighExecutorMemory=20G sparkNormalExecutorMemory=10G +sparkExecutorCores=4 +sparkShufflePartitions=7680 + # The above is given differently in an example I found online oozie.action.sharelib.for.spark=spark2 oozieActionShareLibForSpark2=spark2 @@ -66,29 +67,26 @@ ramGamma=0.6 convergenceError=0.000000000001 # I think this should be the oozie workflow directory -oozieWorkflowPath=user/ilias.kanellos/workflow_example/ - -# The directory where the workflow data is/should be stored -workflowDataDir=user/ilias.kanellos/ranking_workflow +# oozieWorkflowPath=user/ilias.kanellos/workflow_example/ # Directory where json data 
containing scores will be output -bipScorePath=${workflowDataDir}/openaire_universe_scores/ +bipScorePath=${workingDir}/openaire_universe_scores/ # Directory where dataframes are checkpointed -checkpointDir=${nameNode}/${workflowDataDir}/check/ +checkpointDir=${nameNode}/${workingDir}/check/ # The directory for the doi-based bip graph -bipGraphFilePath=${nameNode}/${workflowDataDir}/bipdbv8_graph +bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph # The folder from which synonyms of openaire-ids are read # openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/ -openaireDataInput=${/tmp/prod_provision/graph/18_graph_blacklisted} +openaireDataInput=/tmp/prod_provision/graph/18_graph_blacklisted # A folder where we will write the openaire to doi mapping -synonymFolder=${nameNode}/${workflowDataDir}/openaireid_to_dois/ +synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/ # This will be where we store the openaire graph input. They told us on GARR to use a directory under /data -openaireGraphInputPath=${nameNode}/${workflowDataDir}/openaire_id_graph +openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph # The workflow application path wfAppPath=${nameNode}/${oozieWorkflowPath} @@ -96,8 +94,8 @@ wfAppPath=${nameNode}/${oozieWorkflowPath} oozie.wf.application.path=${wfAppPath} # Path where the final output should be? -actionSetOutputPath=${workflowDataDir}/bip_actionsets/ +actionSetOutputPath=${workingDir}/bip_actionsets/ # The directory to store project impact indicators -projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators +projectImpactIndicatorsOutput=${workingDir}/project_indicators diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index f07a27244..d930ab774 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -46,21 +46,23 @@ create_openaire_ranking_graph.py - --executor-memory 20G --executor-cores 4 --driver-memory 20G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkHighDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireDataInput} ${currentYear} - 7680 + ${sparkShufflePartitions} ${openaireGraphInputPath} @@ -100,18 +102,20 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - 
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} - 7680 + ${sparkShufflePartitions} ${wfAppPath}/CC.py#CC.py @@ -141,21 +145,23 @@ TAR.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} ${ramGamma} ${currentYear} RAM - 7680 + ${sparkShufflePartitions} ${checkpointDir} ${wfAppPath}/TAR.py#TAR.py @@ -189,18 +195,20 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} - 7680 + ${sparkShufflePartitions} 3 ${wfAppPath}/CC.py#CC.py @@ -244,21 +252,23 @@ PageRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} ${pageRankAlpha} ${convergenceError} ${checkpointDir} - 7680 + ${sparkShufflePartitions} dfs ${wfAppPath}/PageRank.py#PageRank.py @@ -289,14 +299,16 @@ AttRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} ${attrankAlpha} @@ -308,7 +320,7 @@ ${convergenceError} ${checkpointDir} - 7680 + ${sparkShufflePartitions} dfs ${wfAppPath}/AttRank.py#AttRank.py @@ -339,7 +351,7 @@ get_ranking_files.sh - /${workflowDataDir} + /${workingDir} ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh @@ -381,24 +393,26 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkNormalExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + json-5-way - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - 7680 + ${sparkShufflePartitions} openaire @@ -429,24 +443,26 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkNormalExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + zenodo - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - 7680 + ${sparkShufflePartitions} openaire @@ -484,14 +500,16 @@ map_openaire_ids_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkHighDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireDataInput} @@ -526,24 +544,26 @@ map_scores_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkHighDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${synonymFolder} - 7680 + ${sparkShufflePartitions} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - 
${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py @@ -576,9 +596,9 @@ eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-memory=${sparkExecutorMemory} + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} + --driver-memory=${sparkNormalDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -609,14 +629,16 @@ projects_impact.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + @@ -624,13 +646,13 @@ ${openaireDataInput}/relations - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} - 7680 + ${sparkShufflePartitions} ${projectImpactIndicatorsOutput} @@ -654,9 +676,9 @@ eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-memory=${sparkExecutorMemory} + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} + --driver-memory=${sparkNormalDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 12a57e1f584ffb13a7e1961b9bf79974b6e05a60 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 15:59:51 +0300 Subject: [PATCH 21/41] Resolve conflicts --- .../impact_indicators/oozie_app/workflow.xml | 138 
+++--------------- 1 file changed, 21 insertions(+), 117 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 8b5313c15..f185f2a8a 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -46,7 +46,7 @@ create_openaire_ranking_graph.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -57,16 +57,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireDataInput} @@ -113,7 +104,7 @@ CC.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -124,16 +115,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} @@ -167,7 +149,7 @@ TAR.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -178,16 +160,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} ${ramGamma} @@ -228,7 +201,7 @@ CC.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -239,16 +212,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} 
--driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} @@ -296,7 +260,7 @@ PageRank.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -307,16 +271,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} ${pageRankAlpha} @@ -354,7 +309,7 @@ AttRank.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -365,16 +320,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} ${attrankAlpha} @@ -459,7 +405,7 @@ format_ranking_results.py -<<<<<<< HEAD + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -470,16 +416,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + json-5-way @@ -520,7 +457,7 @@ format_ranking_results.py -<<<<<<< HEAD + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -531,16 +468,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - 
--conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + zenodo @@ -588,7 +516,7 @@ map_openaire_ids_to_dois.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -599,16 +527,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireDataInput} @@ -643,7 +562,7 @@ map_scores_to_dois.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -654,16 +573,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${synonymFolder} @@ -739,18 +649,12 @@ projects_impact.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkNormalDriverMemory} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 45f2aa0867419093a866fe4686fe3c15400fe7d4 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 17:52:20 +0300 Subject: [PATCH 22/41] Move end node ... 
at the end in workflow.xml --- .../impact_indicators/oozie_app/workflow.xml | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index f185f2a8a..bc40dfd11 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -3,7 +3,7 @@ - + @@ -714,47 +714,42 @@ - - - - - - + PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - + + CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - + + RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -767,4 +762,8 @@ ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + From b83135c252e1d90e117269ae5b7609009d370c31 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 19:55:35 +0300 Subject: [PATCH 23/41] Add missing kill nodes in workflow.xml --- .../impact_indicators/oozie_app/workflow.xml | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index bc40dfd11..d2933e36f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -715,6 +715,22 @@ + + Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -723,20 +739,16 @@ AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Error getting key-value pairs for output files, error message[${wf:errorMessage(wf:lastErrorNode())}] - - Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}] - - RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - Creation of openaire-graph failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}] + + Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}] From 4eec3e7052756002f2f3d48561d516a3a5c003b5 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 22:28:48 +0300 Subject: [PATCH 24/41] Add jobTracker, nameNode && spark2Lib as global params in oozie wf --- .../oozie_app/job.properties | 1 + .../impact_indicators/oozie_app/workflow.xml | 80 ++++++------------- 2 files changed, 24 insertions(+), 57 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index 08f9b1eac..fb68a6928 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -99,3 +99,4 @@ actionSetOutputPath=${workingDir}/bip_actionsets/ # The directory to store project impact indicators projectImpactIndicatorsOutput=${workingDir}/project_indicators +resume=create-openaire-ranking-graph diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index d2933e36f..570dc46f5 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -1,5 +1,17 @@ + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -8,14 +20,14 @@ - ${resume eq "rankings-start"} - ${resume eq "impulse"} - ${resume eq "rankings-iterative"} - ${resume eq "format-results"} - ${resume eq "map-ids"} - ${resume eq "map-scores"} - ${resume eq "start"} - ${resume eq "projects-impact"} + ${wf:conf('resume') eq "rankings-start"} + ${wf:conf('resume') eq "impulse"} + ${wf:conf('resume') eq "rankings-iterative"} + ${wf:conf('resume') eq "format-results"} + ${wf:conf('resume') eq "map-ids"} + ${wf:conf('resume') eq "map-scores"} + ${wf:conf('resume') eq "start"} + ${wf:conf('resume') eq "projects-impact"} @@ -26,10 +38,7 @@ - - ${jobTracker} - - ${nameNode} + - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -135,10 +140,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -187,10 +188,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -238,10 +235,6 @@ - - ${jobTracker} - - ${nameNode} @@ -295,10 +288,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -353,10 +342,6 @@ - - ${jobTracker} - - ${nameNode} /usr/bin/bash @@ -378,7 +363,6 @@ - @@ -391,10 +375,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -443,10 +423,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -498,10 +474,7 @@ - - ${jobTracker} - - ${nameNode} + @@ -548,10 +521,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -636,10 +605,7 @@ - - ${jobTracker} - - ${nameNode} + yarn-cluster cluster From 26328e2a0da67e1469c8781c15750250d915272e Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 16 May 2023 14:39:38 +0300 Subject: [PATCH 25/41] Move job.properties --- 
.../dhp/oa/graph/impact_indicators/{oozie_app => }/job.properties | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/{oozie_app => }/job.properties (100%) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties From 8ef718c3635f88358a3e44187be7b1d38b8b2c55 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 16 May 2023 16:28:48 +0300 Subject: [PATCH 26/41] Fix workflow application path --- dhp-workflows/dhp-impact-indicators/README.md | 10 ++++++++++ .../dhp/oa/graph/impact_indicators/job.properties | 9 ++++++--- .../oa/graph/impact_indicators/oozie_app/workflow.xml | 10 +++++----- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md index aad94ea19..de0ad157c 100644 --- a/dhp-workflows/dhp-impact-indicators/README.md +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -24,3 +24,13 @@ mvn package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/g ``` Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use. + + +Job info and logs: +``` +export OOZIE_URL=http://iis-cdh5-test-m3:11000/oozie +oozie job -info +oozie job -log +``` + +where `jobId` is the id of the job returned by the `run_workflow.sh` script. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index fb68a6928..a2f3d5828 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -76,7 +76,7 @@ bipScorePath=${workingDir}/openaire_universe_scores/ checkpointDir=${nameNode}/${workingDir}/check/ # The directory for the doi-based bip graph -bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph +# bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph # The folder from which synonyms of openaire-ids are read # openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/ @@ -89,9 +89,12 @@ synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/ openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph # The workflow application path -wfAppPath=${nameNode}/${oozieWorkflowPath} +wfAppPath=${oozieTopWfApplicationPath} + # The following is needed as a property of a workflow -oozie.wf.application.path=${wfAppPath} +#oozie.wf.application.path=${wfAppPath} +oozie.wf.application.path=${oozieTopWfApplicationPath} + # Path where the final output should be? 
actionSetOutputPath=${workingDir}/bip_actionsets/ diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 570dc46f5..285a66382 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -126,7 +126,7 @@ ${sparkShufflePartitions} - ${wfAppPath}/CC.py#CC.py + ${wfAppPath}/bip-ranker/CC.py#CC.py @@ -171,7 +171,7 @@ ${sparkShufflePartitions} ${checkpointDir} - ${wfAppPath}/TAR.py#TAR.py + ${wfAppPath}/bip-ranker/TAR.py#TAR.py @@ -216,7 +216,7 @@ ${sparkShufflePartitions} 3 - ${wfAppPath}/CC.py#CC.py + ${wfAppPath}/bip-ranker/CC.py#CC.py @@ -274,7 +274,7 @@ ${sparkShufflePartitions} dfs - ${wfAppPath}/PageRank.py#PageRank.py + ${wfAppPath}/bip-ranker/PageRank.py#PageRank.py @@ -324,7 +324,7 @@ ${sparkShufflePartitions} dfs - ${wfAppPath}/AttRank.py#AttRank.py + ${wfAppPath}/bip-ranker/AttRank.py#AttRank.py From 3c38f7ba6f53f735e4eb23370cc3d09eedcc808a Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 16 May 2023 17:32:53 +0300 Subject: [PATCH 27/41] Fix selection of columns in graph creation --- .../oozie_app/create_openaire_ranking_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 182fd9309..50d2cd99b 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -132,8 +132,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ .repartition(num_partitions, 'citing').drop('relClass')\ - .withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ - .drop('collectedfrom.value')\ + .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\ + .drop('value')\ .where( (F.array_contains(F.col('collected_lower'), "opencitations")) | (F.array_contains(F.col('collected_lower'), "crossref")) From 3d69f33c847b39fd9eb8de3a3cf93d5535a2438f Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 16 May 2023 17:34:42 +0300 Subject: [PATCH 28/41] Fix selection of columns in graph creation --- .../oozie_app/create_openaire_ranking_graph.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 50d2cd99b..3d131933d 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ 
b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -131,8 +131,10 @@ cites_df = spark.read.json(graph_folder + "/relation")\ & (F.col('dataInfo.deletedbyinference') == "false")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ + .drop('deletedbyinference').drop('invisible')\ .repartition(num_partitions, 'citing').drop('relClass')\ .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\ + .drop('collectedfrom.value')\ .drop('value')\ .where( (F.array_contains(F.col('collected_lower'), "opencitations")) From ec4e01068759a48fdfcd94d4e3854059b61f0d42 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 23 May 2023 16:44:04 +0300 Subject: [PATCH 29/41] End after rankings | Create graph debugged --- .../oozie_app/create_openaire_ranking_graph.py | 10 ++++++++-- .../oa/graph/impact_indicators/oozie_app/workflow.xml | 5 +++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 3d131933d..6dd4427b9 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -114,6 +114,12 @@ print ("Total num of research objects: " + str(oa_objects_df.count())) # Keep only required fields - we still keep resulttype.classname to # filter the citation relationships we consider valid oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache() + +''' +print ("OA objects Schema:") +oa_objects_df.printSchema() +sys.exit(0) +''' ############################################################################################################################ # 2. 
Get the relation objects and filter them based on their existence in the oa_objects_df # NOTE: we are only interested in citations of type "cites" @@ -154,8 +160,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\ # references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache() # print ("References df now has: " + str(references_df.count()) + " entries") -cites_df = cites_df.join(oa_objects_df.select('id'), cites_df.citing == oa_objects_df.id).where( F.col('resulttype.classname').isin(valid_result_types) ).drop('id').drop('resulttype.classname') -cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').drop('resulttype.classname').distinct().repartition(num_partitions, 'citing').cache() +cites_df = cites_df.join(oa_objects_df.select('id', 'classname'), cites_df.citing == oa_objects_df.id).where( F.col('classname').isin(valid_result_types) ).drop('id').drop('classname') +cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).distinct().repartition(num_partitions, 'citing').cache() # TODO: add here a clause filtering out the citations # originating from "other" types of research objects which we consider valid diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 285a66382..78cf92bd2 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -81,7 +81,7 @@ - + @@ -335,7 +335,8 @@ - + + From 6a7e370a21d23ec987291da0faa7994f814106e5 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 23 May 2023 16:48:58 +0300 Subject: [PATCH 30/41] Remove unnecessary counts in graph creation --- .../oozie_app/create_openaire_ranking_graph.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 6dd4427b9..2b6b4aae9 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -196,15 +196,19 @@ oa_objects_df.printSchema() # cited_by_df.unpersist(True) # Show total num of unique citations +''' num_unique_citations = citations_df.count() print ("Total unique citations: " + str(num_unique_citations)) +''' ############################################################################################################################ # 3. 
Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references) dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\ .select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing') # Count dangling nodes +''' dangling_num = dangling_nodes.count() print ("Number of dangling nodes: " + str(dangling_num)) +''' # print ("Dangling nodes sample:") # dangling_nodes.show(10, False) ############################################################################################################################ @@ -213,8 +217,10 @@ graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited') # Free space citations_df.unpersist(True) +''' num_nodes = graph.count() print ("Entries in graph before dangling nodes:" + str(num_nodes)) +''' # print ("Sample in graph: ") # graph.show(10, False) From a1b9187039639d0eaf194b5982591850cf688805 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 23 May 2023 17:17:12 +0300 Subject: [PATCH 31/41] Fix syntax error on workflow.xml --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 78cf92bd2..9bd582984 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -335,7 +335,7 @@ - + From 60f25b780de1c456762003cbb8b0011c9c82f93d Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 23 Jun 2023 12:51:50 +0300 Subject: [PATCH 32/41] Minor fixes in workflow.xml and job.properties --- .../oa/graph/impact_indicators/job.properties | 2 +- .../impact_indicators/oozie_app/workflow.xml | 95 ++++++++++--------- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index a2f3d5828..860a14713 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -102,4 +102,4 @@ actionSetOutputPath=${workingDir}/bip_actionsets/ # The directory to store project impact indicators projectImpactIndicatorsOutput=${workingDir}/project_indicators -resume=create-openaire-ranking-graph +resume=entry-point-decision diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 9bd582984..1d49322b6 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -33,7 +33,7 @@ - 
+ @@ -44,12 +44,12 @@ - --> - - - yarn-cluster + --> + + + yarn-cluster cluster - + Openaire Ranking Graph Creation @@ -141,8 +141,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -244,8 +244,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -289,8 +289,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -335,8 +335,8 @@ - - + + @@ -349,7 +349,7 @@ get_ranking_files.sh - /${workingDir} + ${workingDir} ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh @@ -481,8 +481,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -503,7 +503,7 @@ - ${openaireDataInput} + ${openaireDataInput}/ ${synonymFolder} @@ -523,8 +523,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -561,47 +561,48 @@ + - - + + - - - + + + - - - yarn - cluster - Produces the atomic action with the bip finder scores for publications - eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkNormalExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkNormalDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --inputPath${bipScorePath} - --outputPath${actionSetOutputPath}/results/ + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkNormalExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${actionSetOutputPath}/results/ --targetEntityresult - - - + + + From 772d5f0aab8b680688ef66bc27b171c64e93d78f Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 6 Jul 2023 13:47:51 +0300 Subject: [PATCH 33/41] Make PR and AttRank serial --- .../impact_indicators/oozie_app/workflow.xml | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 9bd582984..8466e03e5 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -22,7 +22,10 @@ ${wf:conf('resume') eq "rankings-start"} ${wf:conf('resume') eq "impulse"} - ${wf:conf('resume') eq "rankings-iterative"} + ${wf:conf('resume') eq "pagerank"} + ${wf:conf('resume') eq "attrank"} + + ${wf:conf('resume') eq "format-results"} ${wf:conf('resume') eq "map-ids"} ${wf:conf('resume') eq "map-scores"} @@ -220,16 +223,19 @@ - + + + @@ -278,7 +284,7 @@ - + @@ -328,15 +334,18 @@ - + - + + + to="get-file-names"/> + --> From 
d5c39a10596f732d9a17fdb9d6c5abe014f88c4c Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 6 Jul 2023 15:04:48 +0300 Subject: [PATCH 34/41] Fix map scores to doi --- .../oa/graph/impact_indicators/job.properties | 2 +- .../oozie_app/map_scores_to_dois.py | 28 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) mode change 100644 => 100755 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index a2f3d5828..ea68ade1a 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -47,7 +47,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster # current year used when creating graph / by some ranking methods -currentYear=2024 +currentYear=2023 # Alpha value for pagerank pageRankAlpha=0.5 diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py old mode 100644 new mode 100755 index 0d294e045..0fc67eb53 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py @@ -1,3 +1,4 @@ +#!/usr/bin/python # This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow # and uses this mapping to create doi-based score files in the format required by BiP! DB. 
# This is done by reading each openaire-id based ranking file and joining the openaire based @@ -17,28 +18,35 @@ import pyspark.sql.functions as F # from pyspark.sql.functions import udf ################################################################################################# ################################################################################################# -# Clean up directory name +# Clean up directory name - no longer needed in final workflow version +''' def clean_directory_name(dir_name): # We have a name with the form *_bip_universe_* or *_graph_universe_* # and we need to keep the parts in * + + dir_name_parts = dir_name.split('_') dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] - - clean_name = '_'.join(dir_name_parts) + + dir_name = dir_name.replace("openaire_id_graph", "openaire_ids") + clean_name = dir_name + ".txt.gz" - if '_ids' not in clean_name: - clean_name = clean_name.replace('id_', 'ids_') + # clean_name = '_'.join(dir_name_parts) + + # if '_ids' not in clean_name: + # clean_name = clean_name.replace('id_', 'ids_') # clean_name = clean_name.replace('.txt', '') # clean_name = clean_name.replace('.gz', '') - if 'openaire_ids_' in clean_name: - clean_name = clean_name.replace('openaire_ids_', '') + # if 'openaire_ids_' in clean_name: + # clean_name = clean_name.replace('openaire_ids_', '') # clean_name = clean_name + '.txt.gz' # else: # clean_name = clean_name + '.txt.gz' return clean_name +''' ################################################################################################# if len(sys.argv) < 3: print ("Usage: ./map_scores_to_dois.py <...etc...>") @@ -47,12 +55,12 @@ if len(sys.argv) < 3: # Read arguments synonyms_folder = sys.argv[1] num_partitions = int(sys.argv[2]) -input_file_list = [argument for argument in sys.argv[3:]] -input_file_list = [clean_directory_name(item) for item in input_file_list] +input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]] +# input_file_list = [clean_directory_name(item) for item in input_file_list] # Prepare output specific variables output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list] -output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list] +output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list] # --- INFO MESSAGES --- # print ("\n\n----------------------------") From db4ca43ee84aa29610bbf6dcbef6f921bf57e13c Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 18 Jul 2023 18:38:26 +0300 Subject: [PATCH 35/41] Resolve conflict --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 1f344ba5a..6eb783941 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -340,19 +340,12 @@ -<<<<<<< HEAD -======= - - - ->>>>>>> 60f25b780de1c456762003cbb8b0011c9c82f93d - From 
03da9651620f363ba23720b8f131c084856f28dd Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 21 Jul 2023 13:42:30 +0300 Subject: [PATCH 36/41] Format bip-score based file without doi references --- .../oozie_app/format_ranking_results.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py index e7d62c2f1..8dbbe3ad3 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py @@ -213,7 +213,10 @@ if mode == 'bip': cc_dir = sys.argv[4] impulse_dir = sys.argv[5] ram_dir = sys.argv[6] - refs_dir = sys.argv[7] + + # NOTE: This was used initial, but @Serafeim told me to remove it since we don't get doi-doi referencew anymore + # In case of emergency, bring this back + # refs_dir = sys.argv[7] # Score-specific dataframe pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') @@ -221,7 +224,7 @@ if mode == 'bip': cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') - refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 'id') + # refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 'id') # ----------- TESTING CODE --------------- # # pagerank_entries = pagerank_df.count() @@ -258,9 +261,10 @@ if mode == 'bip': .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized',\ '3-cc', '3-cc_normalized', F.col('score').alias('ram')) - # Add references - results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \ - 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs') + # Add references - THIS WAS REMOVED SINCE WE DON't GET DOI REFERENCES + # In case of emergency bring back + # results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \ + # 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs') # Write resulting dataframe to file output_dir = "/".join(pagerank_dir.split('/')[:-1]) From 2cc5b1a39b36f6c0bc35a23cd2c76b7e04609eaf Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 21 Jul 2023 15:26:50 +0300 Subject: [PATCH 37/41] Fixes in workflow.xml --- .../bipfinder/SparkAtomicActionScoreJob.java | 20 ++-- .../score/deserializers/BipProjectModel.java | 97 ++++++++++--------- .../score/deserializers/BipResultModel.java | 4 +- .../PrepareBipFinder.java | 2 +- .../SparkAtomicActionScoreJobTest.java | 28 +++--- .../project/PrepareH2020ProgrammeTest.java | 2 +- 
.../project/ReadProjectsTest.java | 2 +- .../actionmanager/project/ReadTopicTest.java | 2 +- .../oa/graph/impact_indicators/job.properties | 2 +- .../impact_indicators/oozie_app/workflow.xml | 33 ++++--- 10 files changed, 100 insertions(+), 92 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index 8b8e05723..fb11e829f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -9,7 +9,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -25,8 +24,9 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; @@ -89,8 +89,7 @@ public class SparkAtomicActionScoreJob implements Serializable { default: throw new RuntimeException("Unknown target entity: " + targetEntity); } - } - ); + }); } private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) { @@ -98,17 +97,18 @@ public class SparkAtomicActionScoreJob implements Serializable { // read input bip project scores Dataset projectScores = readPath(spark, inputPath, BipProjectModel.class); - projectScores.map( (MapFunction) bipProjectScores -> { + projectScores.map((MapFunction) bipProjectScores -> { Project project = new Project(); project.setId(bipProjectScores.getProjectId()); project.setMeasures(bipProjectScores.toMeasures()); return project; }, Encoders.bean(Project.class)) - .toJavaRDD() - .map(p -> new AtomicAction(Project.class, p)) - .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + .toJavaRDD() + .map(p -> new AtomicAction(Project.class, p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java index 77c1567a8..680e12504 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java @@ -1,69 +1,74 @@ + package 
eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; -import com.opencsv.bean.CsvBindByPosition; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import eu.dnetlib.dhp.schema.oaf.Measure; +import static eu.dnetlib.dhp.actionmanager.Constants.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import static eu.dnetlib.dhp.actionmanager.Constants.*; +import com.opencsv.bean.CsvBindByPosition; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Measure; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; @NoArgsConstructor @AllArgsConstructor @Getter @Setter public class BipProjectModel { - String projectId; + String projectId; - String numOfInfluentialResults; + String numOfInfluentialResults; - String numOfPopularResults; + String numOfPopularResults; - String totalImpulse; + String totalImpulse; - String totalCitationCount; + String totalCitationCount; - // each project bip measure has exactly one value, hence one key-value pair - private Measure createMeasure(String measureId, String measureValue) { + // each project bip measure has exactly one value, hence one key-value pair + private Measure createMeasure(String measureId, String measureValue) { - KeyValue kv = new KeyValue(); - kv.setKey("score"); - kv.setValue(measureValue); - kv.setDataInfo( - OafMapperUtils.dataInfo( - false, - UPDATE_DATA_INFO_TYPE, - true, - false, - OafMapperUtils.qualifier( - UPDATE_MEASURE_BIP_CLASS_ID, - UPDATE_CLASS_NAME, - ModelConstants.DNET_PROVENANCE_ACTIONS, - ModelConstants.DNET_PROVENANCE_ACTIONS), - "") - ); + KeyValue kv = new KeyValue(); + kv.setKey("score"); + kv.setValue(measureValue); + kv + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); - Measure measure = new Measure(); - measure.setId(measureId); - measure.setUnit(Collections.singletonList(kv)); - return measure; - } - public List toMeasures() { - return Arrays.asList( - createMeasure("numOfInfluentialResults", numOfInfluentialResults), - createMeasure("numOfPopularResults", numOfPopularResults), - createMeasure("totalImpulse", totalImpulse), - createMeasure("totalCitationCount", totalCitationCount) - ); - } + Measure measure = new Measure(); + measure.setId(measureId); + measure.setUnit(Collections.singletonList(kv)); + return measure; + } + + public List toMeasures() { + return Arrays + .asList( + createMeasure("numOfInfluentialResults", numOfInfluentialResults), + createMeasure("numOfPopularResults", numOfPopularResults), + createMeasure("totalImpulse", totalImpulse), + createMeasure("totalCitationCount", totalCitationCount)); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java index 06a173413..f992dc59f 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java @@ -1,13 +1,13 @@ package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; -import eu.dnetlib.dhp.actionmanager.bipmodel.Score; - import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import eu.dnetlib.dhp.actionmanager.bipmodel.Score; + /** * Class that maps the model of the bipFinder! input data. * Only needed for deserialization purposes diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index efcb96a85..0507f90e5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelConstants; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index aa5a19f11..7752fbc27 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -7,8 +7,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Project; import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -28,6 +26,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; public class SparkAtomicActionScoreJobTest { @@ -73,15 +73,16 @@ public class SparkAtomicActionScoreJobTest { } private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception { - SparkAtomicActionScoreJob.main( - new String[] { + SparkAtomicActionScoreJob + .main( + new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-inputPath", inputPath, "-outputPath", outputPath, "-targetEntity", targetEntity, - } - ); + }); } + @Test void testResultScores() throws Exception { final String targetEntity = RESULT; @@ -149,8 +150,8 @@ public class SparkAtomicActionScoreJobTest { void testProjectScores() throws Exception { String targetEntity = PROJECT; String inputResultScores = getClass() - 
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") - .getPath(); + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") + .getPath(); String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; // execute the job to generate the action sets for project scores @@ -159,9 +160,9 @@ public class SparkAtomicActionScoreJobTest { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD projects = sc - .sequenceFile(outputPath, Text.class, Text.class) - .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(aa -> ((Project) aa.getPayload())); + .sequenceFile(outputPath, Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Project) aa.getPayload())); // test the number of projects assertEquals(4, projects.count()); @@ -171,7 +172,8 @@ public class SparkAtomicActionScoreJobTest { // count that the project with id testProjectId is present assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count()); - projects.filter(row -> row.getId().equals(testProjectId)) + projects + .filter(row -> row.getId().equals(testProjectId)) .flatMap(r -> r.getMeasures().iterator()) .foreach(m -> { log.info(m.getId() + " " + m.getUnit()); @@ -184,7 +186,7 @@ public class SparkAtomicActionScoreJobTest { // ensure that the correct key is provided, i.e. score assertEquals("score", kv.getKey()); - switch(m.getId()) { + switch (m.getId()) { case "numOfInfluentialResults": assertEquals("0", kv.getValue()); break; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java index c68bfa13a..b30658feb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java @@ -92,7 +92,7 @@ public class PrepareH2020ProgrammeTest { Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count()); - //tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme))); + // tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme))); Assertions .assertEquals( diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java index 4be09c4b7..0d92c48a8 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java @@ -98,7 +98,7 @@ public class ReadProjectsTest { Assertions.assertEquals("H2020-EU.1.3.", project.getLegalBasis()); Assertions.assertEquals("MSCA-IF-2019", project.getTopics()); - //tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); + // tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java index 
bdb0cc3a1..82a9e6aed 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java @@ -93,7 +93,7 @@ public class ReadTopicTest { Assertions.assertEquals("Individual Fellowships", topic.getTitle()); Assertions.assertEquals("MSCA-IF-2019", topic.getTopic()); - //tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); + // tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); } } diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index 9d6c94ca9..b1598910d 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -97,7 +97,7 @@ oozie.wf.application.path=${oozieTopWfApplicationPath} # Path where the final output should be? -actionSetOutputPath=${workingDir}/bip_actionsets/ +actionSetOutputPath=${workingDir}/bip_actionsets # The directory to store project impact indicators projectImpactIndicatorsOutput=${workingDir}/project_indicators diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 6eb783941..65067dace 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -24,20 +24,21 @@ ${wf:conf('resume') eq "impulse"} ${wf:conf('resume') eq "pagerank"} ${wf:conf('resume') eq "attrank"} - ${wf:conf('resume') eq "format-results"} ${wf:conf('resume') eq "map-ids"} ${wf:conf('resume') eq "map-scores"} ${wf:conf('resume') eq "start"} - ${wf:conf('resume') eq "projects-impact"} - + + ${wf:conf('resume') eq "projects-impact"} + ${wf:conf('resume') eq "projects-impact-actionsets"} + - + @@ -479,7 +480,7 @@ - + @@ -526,7 +527,7 @@ - + @@ -568,14 +569,14 @@ - - + + - + @@ -583,13 +584,13 @@ - + - + - yarn + yarn-cluster cluster Produces the atomic action with the bip finder scores for publications eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob @@ -640,7 +641,7 @@ - ${openaireDataInput}/relations + ${openaireDataInput}/relation ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} @@ -658,16 +659,16 @@ - + - + - yarn + yarn-cluster cluster Produces the atomic action with the bip finder scores for projects eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob From cb0f3c50f69bba7c0db137117e973bb8bd865c3f Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 21 Jul 2023 16:07:10 +0300 Subject: [PATCH 38/41] Format workflow.xml --- .../impact_indicators/oozie_app/workflow.xml | 182 +++--------------- 1 file changed, 32 insertions(+), 150 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml 
b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 65067dace..349e054d8 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -13,7 +13,6 @@ - @@ -38,27 +37,14 @@ - + - - - - - yarn-cluster cluster - - - Openaire Ranking Graph Creation - + OpenAIRE Ranking Graph Creation create_openaire_ranking_graph.py - --executor-memory=${sparkHighExecutorMemory} @@ -80,39 +66,30 @@ ${sparkShufflePartitions} ${openaireGraphInputPath} - + ${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py - - - - + - - yarn-cluster cluster - - - Spark CC - + Citation Count calculation CC.py - --executor-memory=${sparkHighExecutorMemory} @@ -129,31 +106,23 @@ ${openaireGraphInputPath} ${sparkShufflePartitions} - + ${wfAppPath}/bip-ranker/CC.py#CC.py - - - + - - yarn-cluster cluster - - - Spark RAM - + RAM calculation TAR.py - --executor-memory=${sparkHighExecutorMemory} @@ -171,37 +140,27 @@ ${ramGamma} ${currentYear} RAM - ${sparkShufflePartitions} ${checkpointDir} - + ${wfAppPath}/bip-ranker/TAR.py#TAR.py - - - + - - - yarn-cluster cluster - - - Spark Impulse - + Impulse calculation CC.py - --executor-memory=${sparkHighExecutorMemory} @@ -219,47 +178,22 @@ ${sparkShufflePartitions} 3 - + ${wfAppPath}/bip-ranker/CC.py#CC.py - - - - - - - - - - - - - - - - yarn-cluster cluster - - - Spark Pagerank - + Pagerank calculation PageRank.py - --executor-memory=${sparkHighExecutorMemory} @@ -280,31 +214,22 @@ ${sparkShufflePartitions} dfs - + ${wfAppPath}/bip-ranker/PageRank.py#PageRank.py - - - - - yarn-cluster cluster - - - Spark AttRank - + AttRank calculation AttRank.py - --executor-memory=${sparkHighExecutorMemory} @@ -330,27 +255,16 @@ ${sparkShufflePartitions} dfs - + ${wfAppPath}/bip-ranker/AttRank.py#AttRank.py - - - - - - - @@ -360,15 +274,12 @@ ${workingDir} - ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh - - @@ -383,18 +294,12 @@ - - yarn-cluster cluster - - Format Ranking Results JSON - format_ranking_results.py - --executor-memory=${sparkNormalExecutorMemory} @@ -419,13 +324,11 @@ ${sparkShufflePartitions} openaire - + ${wfAppPath}/format_ranking_results.py#format_ranking_results.py - - @@ -471,18 +374,15 @@ ${wfAppPath}/format_ranking_results.py#format_ranking_results.py - - - + - + - @@ -490,15 +390,10 @@ - yarn-cluster cluster - - Openaire-DOI synonym collection - map_openaire_ids_to_dois.py - --executor-memory=${sparkHighExecutorMemory} @@ -515,19 +410,16 @@ ${openaireDataInput}/ ${synonymFolder} - + ${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py - - - - + @@ -535,12 +427,8 @@ yarn-cluster cluster - - Mapping Openaire Scores to DOIs - map_scores_to_dois.py - --executor-memory=${sparkHighExecutorMemory} @@ -564,18 +452,15 @@ ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py - - - + @@ -590,11 +475,13 @@ + yarn-cluster cluster Produces the atomic action with the bip finder scores for publications eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -609,23 +496,19 @@ 
--outputPath${actionSetOutputPath}/results/ --targetEntityresult + + - - yarn-cluster cluster - - - Project Impact Indicators - + Project Impact Indicators calculation projects_impact.py - --executor-memory=${sparkHighExecutorMemory} @@ -639,7 +522,6 @@ - ${openaireDataInput}/relation @@ -653,26 +535,23 @@ ${sparkShufflePartitions} ${projectImpactIndicatorsOutput} - - ${wfAppPath}/projects_impact.py#projects_impact.py - - - + yarn-cluster cluster Produces the atomic action with the bip finder scores for projects eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -683,12 +562,15 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --inputPath${projectImpactIndicatorsOutput} --outputPath${actionSetOutputPath}/projects/ --targetEntityproject + + From 2374f445a941a76fe239a95e75a5e491c12a22bf Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 21 Jul 2023 17:42:46 +0300 Subject: [PATCH 39/41] Produce additional bip update specific files --- .../oozie_app/map_scores_to_dois.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py index 0fc67eb53..f6a8e9996 100755 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py @@ -15,6 +15,8 @@ from pyspark.sql.types import * # Import sql functions with shorthand alias import pyspark.sql.functions as F + +from pyspark.sql.functions import max # from pyspark.sql.functions import udf ################################################################################################# ################################################################################################# @@ -127,6 +129,10 @@ for offset, input_file in enumerate(input_file_list): # Load file to dataframe ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id') + + # Get max score + max_score = ranking_df.select(max('score').alias('max')).collect()[0]['max'] + print ("Max Score for " + str(input_file) + " is " + str(max_score)) # TESTING # print ("Loaded df sample:") @@ -138,6 +144,15 @@ for offset, input_file in enumerate(input_file_list): output_file = output_file_list[offset] print ("Writing to: " + output_file) doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') + + # Creata another file for the bip update process + ranking_df = ranking_df.select('id', 'score', F.lit(F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup')) + doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi').cache() + output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz") + print ("Writing bip update to: " + output_file) + doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') + + # Free 
memory? ranking_df.unpersist(True) From 3a0f09774a941b38e641b2d74ea073a9b6bce187 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 21 Jul 2023 17:55:41 +0300 Subject: [PATCH 40/41] Add script to find score limits --- .../oozie_app/get_score_limits.sh | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh new file mode 100644 index 000000000..6d4161d7f --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh @@ -0,0 +1,63 @@ +#/usr/bin/bash + +# Read log files from ranking scripts and create a two-line file +# with score limits for the various measures. To be used by Kleanthis + +attrank_file=$(ls *attrank*.log); +pr_file=$(ls *pagerank*.log) +ram_file=$(ls *ram*.log); +cc_file=$(ls *cc*.log); +impulse_file=$(ls *impulse*.log); + +echo +echo "-----------------------------" +echo "Attrank file:${attrank_file}"; +echo "PageRank file:${pr_file}"; +echo "RAM file:${ram_file}"; +echo "CC file:${cc_file}"; +echo "Impulse file:${impulse_file}"; +echo "-----------------------------" +echo +echo + +# output file will be called score_limits.csv +echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv +# ---------------------------------------------------- # +# Get respective score limits (we don't need RAM) +inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2); +inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2); +inf_1=$(grep "^1%" ${pr_file} | cut -f 2); +inf_10=$(grep "^10%" ${pr_file} | cut -f 2); +echo "Influnence limits:" +echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}"; +# ---------------------------------------------------- # +pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2); +pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2); +pop_1=$(grep "^1%" ${attrank_file} | cut -f 2); +pop_10=$(grep "^10%" ${attrank_file} | cut -f 2); +echo "Popularity limits:"; +echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}"; +# ---------------------------------------------------- # +imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2); +imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2); +imp_1=$(grep "^1%" ${impulse_file} | cut -f 2); +imp_10=$(grep "^10%" ${impulse_file} | cut -f 2); +echo "Popularity limits:"; +echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}"; +# ---------------------------------------------------- # +cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2); +cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2); +cc_1=$(grep "^1%" ${cc_file} | cut -f 2); +cc_10=$(grep "^10%" ${cc_file} | cut -f 2); +echo "Popularity limits:"; +echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}"; +# ---------------------------------------------------- # + +echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv + +echo +echo "score_limits.csv contents:" +cat 
score_limits.csv + +echo; +echo; From 97c1ba89187b5c57b6cac3263dd8c9d855c586d8 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 11 Aug 2023 15:56:53 +0300 Subject: [PATCH 41/41] Merge actionsets of results and projects --- .../bipfinder/SparkAtomicActionScoreJob.java | 62 ++++--- .../bipfinder/input_actionset_parameter.json | 18 +-- .../SparkAtomicActionScoreJobTest.java | 152 +++++++++--------- .../impact_indicators/oozie_app/workflow.xml | 76 +++------ 4 files changed, 130 insertions(+), 178 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index fb11e829f..040c89782 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -6,13 +6,14 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.List; -import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; @@ -41,8 +42,6 @@ import scala.Tuple2; */ public class SparkAtomicActionScoreJob implements Serializable { - private static final String RESULT = "result"; - private static final String PROJECT = "project"; private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -61,15 +60,15 @@ public class SparkAtomicActionScoreJob implements Serializable { Boolean isSparkSessionManaged = isSparkSessionManaged(parser); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + final String resultsInputPath = parser.get("resultsInputPath"); + log.info("resultsInputPath: {}", resultsInputPath); + + final String projectsInputPath = parser.get("projectsInputPath"); + log.info("projectsInputPath: {}", projectsInputPath); final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); - final String targetEntity = parser.get("targetEntity"); - log.info("targetEntity: {}", targetEntity); - SparkConf conf = new SparkConf(); runWithSparkSession( @@ -78,26 +77,23 @@ public class SparkAtomicActionScoreJob implements Serializable { spark -> { removeOutputDir(spark, outputPath); - // follow different procedures for different target entities - switch (targetEntity) { - case RESULT: - prepareResults(spark, inputPath, outputPath); - break; - case PROJECT: - prepareProjects(spark, inputPath, outputPath); - break; - default: - throw new RuntimeException("Unknown target entity: " + targetEntity); - } + JavaPairRDD resultsRDD = prepareResults(spark, resultsInputPath, outputPath); + JavaPairRDD projectsRDD = prepareProjects(spark, projectsInputPath, outputPath); + + resultsRDD + .union(projectsRDD) + .saveAsHadoopFile( + outputPath, Text.class, Text.class, 
SequenceFileOutputFormat.class, GzipCodec.class); }); } - private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) { + private static JavaPairRDD prepareProjects(SparkSession spark, String inputPath, + String outputPath) { // read input bip project scores Dataset projectScores = readPath(spark, inputPath, BipProjectModel.class); - projectScores.map((MapFunction) bipProjectScores -> { + return projectScores.map((MapFunction) bipProjectScores -> { Project project = new Project(); project.setId(bipProjectScores.getProjectId()); project.setMeasures(bipProjectScores.toMeasures()); @@ -107,12 +103,12 @@ public class SparkAtomicActionScoreJob implements Serializable { .map(p -> new AtomicAction(Project.class, p)) .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } - private static void prepareResults(SparkSession spark, String bipScorePath, String outputPath) { + private static JavaPairRDD prepareResults(SparkSession spark, String bipScorePath, + String outputPath) { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -128,24 +124,20 @@ public class SparkAtomicActionScoreJob implements Serializable { return bs; }).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class)); - bipScores + return bipScores.map((MapFunction) bs -> { + Result ret = new Result(); - .map((MapFunction) bs -> { - Result ret = new Result(); + ret.setId(bs.getId()); - ret.setId(bs.getId()); + ret.setMeasures(getMeasure(bs)); - ret.setMeasures(getMeasure(bs)); - - return ret; - }, Encoders.bean(Result.class)) + return ret; + }, Encoders.bean(Result.class)) .toJavaRDD() .map(p -> new AtomicAction(Result.class, p)) .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); - + new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } private static List getMeasure(BipScore value) { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json index d6b93c5af..c472eb5e6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -6,9 +6,15 @@ "paramRequired": false }, { - "paramName": "ip", - "paramLongName": "inputPath", - "paramDescription": "the URL from where to get the programme file", + "paramName": "rip", + "paramLongName": "resultsInputPath", + "paramDescription": "the URL from where to get the input file for results", + "paramRequired": true + }, + { + "paramName": "pip", + "paramLongName": "projectsInputPath", + "paramDescription": "the URL from where to get the input file for projects", "paramRequired": true }, { @@ -16,11 +22,5 @@ "paramLongName": "outputPath", "paramDescription": "the path of the new ActionSet", "paramRequired": true - }, - { - "paramName": "te", - "paramLongName": "targetEntity", - "paramDescription": "the type of target entity to be enriched; currently supported 
one of { 'result', 'project' }", - "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index 7752fbc27..542354836 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -7,6 +7,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import javax.xml.crypto.Data; + import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -27,6 +29,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; @@ -38,9 +41,6 @@ public class SparkAtomicActionScoreJobTest { private static Path workingDir; - private final static String RESULT = "result"; - private final static String PROJECT = "project"; - private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class); @BeforeAll @@ -72,50 +72,64 @@ public class SparkAtomicActionScoreJobTest { spark.stop(); } - private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception { + private void runJob(String resultsInputPath, String projectsInputPath, String outputPath) throws Exception { SparkAtomicActionScoreJob .main( new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-inputPath", inputPath, + "-resultsInputPath", resultsInputPath, + "-projectsInputPath", projectsInputPath, "-outputPath", outputPath, - "-targetEntity", targetEntity, }); } @Test - void testResultScores() throws Exception { - final String targetEntity = RESULT; - String inputResultScores = getClass() + void testScores() throws Exception { + + String resultsInputPath = getClass() .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json") .getPath(); - String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; + + String projectsInputPath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") + .getPath(); + + String outputPath = workingDir.toString() + "/actionSet"; // execute the job to generate the action sets for result scores - runJob(inputResultScores, outputPath, targetEntity); + runJob(resultsInputPath, projectsInputPath, outputPath); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = sc + JavaRDD tmp = sc .sequenceFile(outputPath, Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(aa -> ((Result) aa.getPayload())); + .map(aa -> ((OafEntity) aa.getPayload())); - assertEquals(4, tmp.count()); + assertEquals(8, tmp.count()); - Dataset verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class)); + Dataset verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(OafEntity.class)); verificationDataset.createOrReplaceTempView("result"); - Dataset execVerification = spark + Dataset testDataset = spark .sql( "Select p.id oaid, mes.id, mUnit.value from result p " + 
"lateral view explode(measures) m as mes " + "lateral view explode(mes.unit) u as mUnit "); - Assertions.assertEquals(12, execVerification.count()); +// execVerification.show(); + + Assertions.assertEquals(28, testDataset.count()); + + assertResultImpactScores(testDataset); + assertProjectImpactScores(testDataset); + + } + + void assertResultImpactScores(Dataset testDataset) { Assertions .assertEquals( - "6.63451994567e-09", execVerification + "6.63451994567e-09", testDataset .filter( "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "and id = 'influence'") @@ -125,7 +139,7 @@ public class SparkAtomicActionScoreJobTest { .getString(0)); Assertions .assertEquals( - "0.348694533145", execVerification + "0.348694533145", testDataset .filter( "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "and id = 'popularity_alt'") @@ -135,7 +149,7 @@ public class SparkAtomicActionScoreJobTest { .getString(0)); Assertions .assertEquals( - "2.16094680115e-09", execVerification + "2.16094680115e-09", testDataset .filter( "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "and id = 'popularity'") @@ -143,65 +157,49 @@ public class SparkAtomicActionScoreJobTest { .collectAsList() .get(0) .getString(0)); - } - @Test - void testProjectScores() throws Exception { - String targetEntity = PROJECT; - String inputResultScores = getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") - .getPath(); - String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; + void assertProjectImpactScores(Dataset testDataset) throws Exception { - // execute the job to generate the action sets for project scores - runJob(inputResultScores, outputPath, PROJECT); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD projects = sc - .sequenceFile(outputPath, Text.class, Text.class) - .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(aa -> ((Project) aa.getPayload())); - - // test the number of projects - assertEquals(4, projects.count()); - - String testProjectId = "40|nih_________::c02a8233e9b60f05bb418f0c9b714833"; - - // count that the project with id testProjectId is present - assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count()); - - projects - .filter(row -> row.getId().equals(testProjectId)) - .flatMap(r -> r.getMeasures().iterator()) - .foreach(m -> { - log.info(m.getId() + " " + m.getUnit()); - - // ensure that only one score is present for each bip impact measure - assertEquals(1, m.getUnit().size()); - - KeyValue kv = m.getUnit().get(0); - - // ensure that the correct key is provided, i.e. 
score - assertEquals("score", kv.getKey()); - - switch (m.getId()) { - case "numOfInfluentialResults": - assertEquals("0", kv.getValue()); - break; - case "numOfPopularResults": - assertEquals("1", kv.getValue()); - break; - case "totalImpulse": - assertEquals("25", kv.getValue()); - break; - case "totalCitationCount": - assertEquals("43", kv.getValue()); - break; - default: - fail("Unknown measure id in the context of projects"); - } - }); + Assertions + .assertEquals( + "0", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'numOfInfluentialResults'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "1", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'numOfPopularResults'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "25", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'totalImpulse'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "43", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'totalCitationCount'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); } } diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 349e054d8..c225fa3e1 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -31,7 +31,7 @@ ${wf:conf('resume') eq "projects-impact"} - ${wf:conf('resume') eq "projects-impact-actionsets"} + ${wf:conf('resume') eq "create-actionset"} @@ -455,53 +455,11 @@ ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py - + - - - - - - - - - - - - - - - - - yarn-cluster - cluster - Produces the atomic action with the bip finder scores for publications - eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob - dhp-aggregation-${projectVersion}.jar - - - --executor-memory=${sparkNormalExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkNormalDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --inputPath${bipScorePath} - --outputPath${actionSetOutputPath}/results/ - --targetEntityresult - - - - - - - @@ -538,17 +496,26 @@ ${wfAppPath}/projects_impact.py#projects_impact.py - + - - + + + + + + + + + + + yarn-cluster cluster - Produces the atomic action with the bip finder scores for projects + Produces the atomic action with the bip finder scores eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar @@ -563,14 +530,13 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${projectImpactIndicatorsOutput} - --outputPath${actionSetOutputPath}/projects/ - --targetEntityproject + --resultsInputPath${bipScorePath} + 
--projectsInputPath${projectImpactIndicatorsOutput} + --outputPath${actionSetOutputPath} - - + @@ -630,10 +596,6 @@ Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - -
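
Note (after patch 41/41): result and project scores are now written by a single action-set job. A minimal spark-submit sketch of the merged invocation is shown below — the class name, jar naming and parameter names come from the patches above, while the submit wrapper and the HDFS paths are illustrative placeholders; the Oozie `create-actionset` action remains the reference invocation.

```
spark-submit \
  --class eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob \
  dhp-aggregation-${projectVersion}.jar \
  --resultsInputPath /path/to/bip_scores \
  --projectsInputPath /path/to/project_indicators \
  --outputPath /path/to/bip_actionsets
```

As implemented in patch 41, the job unions the result and project AtomicAction pair RDDs and saves them as a single gzip-compressed SequenceFile under the given output path.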