From d7da4f814ba17f71e09c047c0de8cbcd90c481b8 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 12 Apr 2024 18:12:06 +0300 Subject: [PATCH] Minor updates to the copying operation to Impala Cluster: - Improve logging. - Code optimization/polishing. --- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- 4 files changed, 160 insertions(+), 172 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index fceb1b76b8..3d9986b64f 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -11,7 +11,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -29,10 +29,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -59,12 +59,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -73,7 +74,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -89,9 +90,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -99,7 +100,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -111,31 +112,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -143,65 +144,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements[@]} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements[@]} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -209,13 +207,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 7ff6a5d526..2711d6e12b 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -10,7 +10,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -28,10 +28,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -58,12 +58,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -72,7 +73,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -88,9 +89,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -98,7 +99,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -110,31 +111,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -142,65 +143,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements[@]} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements[@]} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -208,13 +206,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 8900adcb5e..5ad9df762f 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -10,7 +10,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -28,10 +28,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -58,12 +58,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -72,7 +73,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -88,9 +89,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -98,7 +99,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -110,31 +111,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -142,65 +143,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -208,13 +206,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index bbb5e43eee..c2324b9124 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -9,7 +9,7 @@ fi # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -27,10 +27,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -60,12 +60,13 @@ export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -74,7 +75,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -90,9 +91,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -100,7 +101,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -112,31 +113,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -144,65 +145,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -210,13 +208,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" }