From 86f4f63daf4722b712f5693776f50d1c0cb20ba6 Mon Sep 17 00:00:00 2001
From: dimitrispie
Date: Thu, 18 May 2023 09:33:05 +0300
Subject: [PATCH] Updates to steps related to transfer data to impala cluster

1. Remove external table definitions in stats_ext
2. Fix the issue where some views are not created.
3. Added two workflow parameters for copying also the usage stats dbs
4. Expand the usage-stats variables in the createShadowDB call
   ($USAGE_STATS_DB / $USAGE_STATS_DB_SHADOW) so the shadow db is built from
   the parameter values instead of the literal strings.
---
 .../oozie_app/copyDataToImpalaCluster.sh      | 24 +++++++++++++------
 .../stats/oozie_app/finalizeImpalaCluster.sh  |  3 +++
 .../dhp/oa/graph/stats/oozie_app/workflow.xml | 11 +++++++++
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index 4ff236d07..66783c234 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -8,7 +8,7 @@ fi
 
 #export HADOOP_USER_NAME="dimitris.pierrakos"
 export HADOOP_USER_NAME=$5
-
+export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
 function copydb() {
         db=$1
         FILE=("hive_wf_tmp_"$RANDOM)
@@ -27,16 +27,23 @@ function copydb() {
         impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
 
         echo "creating schema for ${db}"
+        for (( k = 0; k < 5; k ++ )); do
         for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
         do
                 impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
         done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
+        done
 
-        # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
-        for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
-        do
-                impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
-        done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
+#        for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
+#        do
+#                impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
+#        done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
+#
+#        # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
+#        for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
+#        do
+#                impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
+#        done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
 
         # load the data from /tmp in the respective tables
         echo "copying data in tables and computing stats"
@@ -54,8 +61,11 @@ STATS_DB=$1
 MONITOR_DB=$2
 OBSERVATORY_DB=$3
 EXT_DB=$4
-HADOOP_USER_NAME=$5
+USAGE_STATS_DB=$5
+HADOOP_USER_NAME=$6
 
+copydb $USAGE_STATS_DB
+copydb $PROD_USAGE_STATS_DB
 copydb $EXT_DB
 copydb $STATS_DB
 copydb $MONITOR_DB
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh
index fedfa00af..5914b95f8 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh
@@ -22,7 +22,10 @@ MONITOR_DB=$3
 MONITOR_DB_SHADOW=$4
 OBSERVATORY_DB=$5
 OBSERVATORY_DB_SHADOW=$6
+USAGE_STATS_DB=$7
+USAGE_STATS_DB_SHADOW=$8
 
 createShadowDB $STATS_DB $STATS_DB_SHADOW
 createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW
 createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW
+createShadowDB $USAGE_STATS_DB $USAGE_STATS_DB_SHADOW
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index 8d2e56380..68ef4595e 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -12,6 +12,10 @@
             <name>external_stats_db_name</name>
             <description>the external stats that should be added since they are not included in the graph database</description>
         </property>
+        <property>
+            <name>usage_stats_db_name</name>
+            <description>the usage statistics database name</description>
+        </property>
         <property>
             <name>stats_db_shadow_name</name>
             <description>the name of the shadow schema</description>
@@ -32,6 +36,10 @@
             <name>observatory_db_shadow_name</name>
             <description>the name of the shadow monitor db</description>
         </property>
+        <property>
+            <name>usage_stats_db_shadow_name</name>
+            <description>the name of the shadow usage stats db</description>
+        </property>
         <property>
             <name>stats_tool_api_url</name>
             <description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
@@ -434,6 +442,7 @@
             <argument>${monitor_db_name}</argument>
             <argument>${observatory_db_name}</argument>
             <argument>${external_stats_db_name}</argument>
+            <argument>${usage_stats_db_name}</argument>
             <argument>${hadoop_user_name}</argument>
             <file>copyDataToImpalaCluster.sh</file>
         </shell>
@@ -452,6 +461,8 @@
             <argument>${monitor_db_shadow_name}</argument>
             <argument>${observatory_db_name}</argument>
             <argument>${observatory_db_shadow_name}</argument>
+            <argument>${usage_stats_db_name}</argument>
+            <argument>${usage_stats_db_shadow_name}</argument>
             <file>finalizeImpalaCluster.sh</file>