From 0198362200ee6a220f37e59c8d757da3c7d31935 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 21 Feb 2024 16:52:28 +0200 Subject: [PATCH] Add documentation about the problems with views and set the number of iterations back to 5. --- .../stats/oozie_app/copyDataToImpalaCluster.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 989eeae84..043780c07 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -44,12 +44,12 @@ function copydb() { # Delete the old DB from Impala cluster. # drop tables from db for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do - `impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table ${i};"`; + `impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table ${i};"`; done # drop views from db for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do - `impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop view ${i};"`; + `impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop view ${i};"`; done # delete the database @@ -69,9 +69,15 @@ function copydb() { # The "2-times-loop" is there to retry creating the views for which their tables have not been created yet. # Since there are many DBs handled in this script and many more may be added, we cannot easily keep track of the views of all those tables, so leave this extra loop for now. - for (( k = 1; k <= 2; k++ )); do + # Ideally, we should use a "do.. 
while" loop, but bash does not support it, so we use an infinite "while" loop with a "break". + should_retry=0 # Should retry creating the views (in case their tables were not created before them). + # There are views of other views as well, so we may have 3, 4 or 5 levels of nested views and need to retry.. + # That's why there was a 5-times loop before.. and why it is added back temporarily.. + + for (( k = 1; k <= 5; k++ )); do # TODO - To be replaced by a while-loop. echo -e "\nCreate tables iteration_${k}\n" - for i in `hive -e "use $db; show tables;" | sed 's/WARN:.*//g'`; do + for i in `hive -e "use $db; show tables;" | sed 's/WARN:.*//g'`; do # This includes "views" as well.. + # TODO - A view will not have a parquet-file with its name and it would not help anyway. So, we need to find another way to know how the views are created.. CURRENT_PRQ_FILE=`hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -ls -C "/tmp/${TEMP_SUBDIR}/${db}.db/${i}/*.parq" | head -1` if [[ -z "$CURRENT_PRQ_FILE" ]]; then echo -e "The table \"${i}\" had no parquet files to get the schema from!\n"