From 0bf2a7a359e7613226d42cba3eb46cdba8aa3e4c Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 1 Apr 2024 15:23:22 +0300 Subject: [PATCH 1/3] fixed the result_country definition --- .../scripts/step16-createIndicatorsTables.sql | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index e614ffcbd..73f215ef0 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1000,13 +1000,16 @@ left outer join ( drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ create table ${stats_db_name}.result_country stored as parquet as -select distinct ro.id, coalesce(o.country, f.country) as country -from ${stats_db_name}.result_organization ro -left outer join ${stats_db_name}.organization o on o.id=ro.organization -left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id -left outer join ${stats_db_name}.project p on p.id=rp.project -left outer join ${stats_db_name}.funder f on f.name=p.funder -where coalesce(o.country, f.country) IS NOT NULL; /*EOS*/ +select distinct * from ( + select ro.id, o.country as country + from ${stats_db_name}.result_organization ro + join ${stats_db_name}.organization o on o.id=ro.organization + union all + select rp.id, f.country as country + from ${stats_db_name}.result_projects rp + left outer join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.funder f on f.name=p.funder ) u +where u.country is not NULL; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as From 573b081f1df96d10a6fd9cc3044fbe8d5f6a0c1b Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 1 Apr 2024 22:24:46 +0300 Subject: [PATCH 2/3] added new orgs in monitor --- .../stats/oozie_app/scripts/step20-createMonitorDBAll.sql | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index b4f87a184..a8392b226 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -81,7 +81,11 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications ) )) foo; create view if not exists TARGET.category as select * from SOURCE.category; From b7c8acc563de17ef2d3be13ff5fc085e5eec4815 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Apr 2024 13:15:37 +0300 Subject: [PATCH 3/3] - Update the code which acquires the "IMPALA_HDFS_NODE", to test the "tmp"-dir, instead of the base-dir and introduce retries, to overcome potential file-system failures. This change was suggested by "Sebastian Tymkow" and "Grzegorz Bakalarski". - Fix typos. --- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++++++++++++----- .../oozie_app/copyDataToImpalaCluster.sh | 27 ++++++++++++++----- .../oozie_app/copyDataToImpalaCluster.sh | 27 ++++++++++++++----- .../oozie_app/copyDataToImpalaCluster.sh | 27 ++++++++++++++----- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 4 +-- 5 files changed, 81 insertions(+), 30 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index ef80d0094..6250aca81 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -9,15 +9,27 @@ fi export HADOOP_USER_NAME=$2 IMPALA_HDFS_NODE='' -if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' -elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' -else - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" +COUNTER=0 + +while [ $COUNTER -lt 3 ]; do + if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' + break + elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' + break + else + IMPALA_HDFS_NODE='' + sleep 1 + fi + ((COUNTER++)) +done + +if [ -z "$IMPALA_HDFS_NODE" ]; then + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" +echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." function copydb() { diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index e7d183ddb..97fa0dd9c 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -9,15 +9,28 @@ fi export HADOOP_USER_NAME=$2 IMPALA_HDFS_NODE='' -if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' -elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' -else - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" +COUNTER=0 + +while [ $COUNTER -lt 3 ]; do + if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' + break + elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' + break + else + IMPALA_HDFS_NODE='' + sleep 1 + fi + ((COUNTER++)) +done + +if [ -z "$IMPALA_HDFS_NODE" ]; then + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" +echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." + function copydb() { diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 148d9b0b6..81ac088c0 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -9,15 +9,28 @@ fi #export HADOOP_USER_NAME=$2 IMPALA_HDFS_NODE='' -if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' -elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' -else - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" +COUNTER=0 + +while [ $COUNTER -lt 3 ]; do + if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' + break + elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' + break + else + IMPALA_HDFS_NODE='' + sleep 1 + fi + ((COUNTER++)) +done + +if [ -z "$IMPALA_HDFS_NODE" ]; then + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" +echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." + function copydb() { diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 1996c0b03..3f8447b6c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -7,15 +7,28 @@ then fi IMPALA_HDFS_NODE='' -if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' -elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then - IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' -else - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" +COUNTER=0 + +while [ $COUNTER -lt 3 ]; do + if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' + break + elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then + IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' + break + else + IMPALA_HDFS_NODE='' + sleep 1 + fi + ((COUNTER++)) +done + +if [ -z "$IMPALA_HDFS_NODE" ]; then + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" +echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." + export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index 872456973..a5b6a54cb 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -85,12 +85,12 @@ hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/ hive -f foo echo "Updated shadow monitor funded database" -echo "Updating shadow monitor insitutions database" +echo "Updating shadow monitor institutions database" hive -e "drop database if exists ${SHADOW}_institutions cascade" hive -e "create database if not exists ${SHADOW}_institutions" hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo hive -f foo -echo "Shadow db monitor insitutions ready!" +echo "Shadow db monitor institutions ready!" echo "Updating shadow monitor RIs database" for i in $contexts