Merge pull request 'Miscellaneous updates to the copying operation to Impala Cluster.' (#447) from antonis.lempesis/dnet-hadoop:beta into beta

Reviewed-on: D-Net/dnet-hadoop#447
This commit is contained in:
Claudio Atzori 2024-06-18 15:52:30 +02:00
commit dd541f8cf5
5 changed files with 146 additions and 93 deletions

View File

@ -32,9 +32,7 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred.
exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -55,20 +53,20 @@ function print_elapsed_time()
hours=$((elapsed_time / 3600)) hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60)) minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60)) seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds printf "%02d:%02d:%02d" $hours $minutes $seconds
} }
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
@ -78,6 +76,7 @@ function copydb() {
fi fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
start_file_transfer_time=$(date +%s)
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 70 * 6144 = 430 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
@ -93,7 +92,7 @@ function copydb() {
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
@ -118,6 +117,7 @@ function copydb() {
fi fi
echo -e "\nCreating schema for db: '${db}'\n" echo -e "\nCreating schema for db: '${db}'\n"
start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
@ -128,7 +128,8 @@ function copydb() {
all_create_view_statements=() all_create_view_statements=()
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs.
echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n"
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
@ -145,15 +146,16 @@ function copydb() {
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5 exit 5
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 6 exit 6
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi fi
fi fi
fi fi
@ -184,7 +186,7 @@ function copydb() {
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
@ -196,9 +198,11 @@ function copydb() {
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! BREAKING-OUT..\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7 exit 7
else
break # Break form the inf-loop of views and continue by computing stats for the tables.
fi fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -208,8 +212,11 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
@ -217,24 +224,30 @@ function copydb() {
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 8
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi
fi fi
done done
echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n"
rm -f error.log # Cleanup the temp log-file.
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala) # Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 8 exit 9
fi fi
fi fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }

View File

@ -32,9 +32,7 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred.
exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -55,20 +53,20 @@ function print_elapsed_time()
hours=$((elapsed_time / 3600)) hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60)) minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60)) seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds printf "%02d:%02d:%02d" $hours $minutes $seconds
} }
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
@ -78,6 +76,7 @@ function copydb() {
fi fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
start_file_transfer_time=$(date +%s)
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 70 * 6144 = 430 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
@ -93,7 +92,7 @@ function copydb() {
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
@ -118,6 +117,7 @@ function copydb() {
fi fi
echo -e "\nCreating schema for db: '${db}'\n" echo -e "\nCreating schema for db: '${db}'\n"
start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
@ -128,7 +128,8 @@ function copydb() {
all_create_view_statements=() all_create_view_statements=()
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs.
echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n"
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
@ -145,15 +146,16 @@ function copydb() {
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5 exit 5
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 6 exit 6
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi fi
fi fi
fi fi
@ -184,7 +186,7 @@ function copydb() {
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
@ -196,9 +198,11 @@ function copydb() {
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! BREAKING-OUT..\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7 exit 7
else
break # Break form the inf-loop of views and continue by computing stats for the tables.
fi fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -208,8 +212,11 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
@ -217,24 +224,30 @@ function copydb() {
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 8
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi
fi fi
done done
echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n"
rm -f error.log # Cleanup the temp log-file.
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala) # Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 8 exit 9
fi fi
fi fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }

View File

@ -32,9 +32,7 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred.
exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -55,20 +53,20 @@ function print_elapsed_time()
hours=$((elapsed_time / 3600)) hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60)) minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60)) seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds printf "%02d:%02d:%02d" $hours $minutes $seconds
} }
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
@ -78,6 +76,7 @@ function copydb() {
fi fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
start_file_transfer_time=$(date +%s)
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 70 * 6144 = 430 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
@ -93,7 +92,7 @@ function copydb() {
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
@ -118,6 +117,7 @@ function copydb() {
fi fi
echo -e "\nCreating schema for db: '${db}'\n" echo -e "\nCreating schema for db: '${db}'\n"
start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
@ -128,7 +128,8 @@ function copydb() {
all_create_view_statements=() all_create_view_statements=()
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs.
echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n"
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
@ -145,15 +146,16 @@ function copydb() {
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5 exit 5
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 6 exit 6
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi fi
fi fi
fi fi
@ -184,7 +186,7 @@ function copydb() {
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
@ -196,9 +198,11 @@ function copydb() {
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! BREAKING-OUT..\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7 exit 7
else
break # Break form the inf-loop of views and continue by computing stats for the tables.
fi fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -208,8 +212,11 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
@ -217,24 +224,30 @@ function copydb() {
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 8
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi
fi fi
done done
echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n"
rm -f error.log # Cleanup the temp log-file.
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala) # Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 8 exit 9
fi fi
fi fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }

View File

@ -30,9 +30,7 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred.
exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -57,20 +55,20 @@ function print_elapsed_time()
hours=$((elapsed_time / 3600)) hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60)) minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60)) seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds printf "%02d:%02d:%02d" $hours $minutes $seconds
} }
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
@ -80,6 +78,7 @@ function copydb() {
fi fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
start_file_transfer_time=$(date +%s)
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 70 * 6144 = 430 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
@ -95,7 +94,7 @@ function copydb() {
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
@ -120,6 +119,7 @@ function copydb() {
fi fi
echo -e "\nCreating schema for db: '${db}'\n" echo -e "\nCreating schema for db: '${db}'\n"
start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
@ -130,7 +130,8 @@ function copydb() {
all_create_view_statements=() all_create_view_statements=()
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs.
echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n"
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
@ -147,15 +148,16 @@ function copydb() {
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5 exit 5
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 6 exit 6
fi fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi fi
fi fi
fi fi
@ -186,7 +188,7 @@ function copydb() {
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
@ -198,9 +200,11 @@ function copydb() {
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! BREAKING-OUT..\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7 exit 7
else
break # Break form the inf-loop of views and continue by computing stats for the tables.
fi fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -210,8 +214,11 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
@ -219,24 +226,30 @@ function copydb() {
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
rm -f error.log
exit 8
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
fi
fi fi
done done
echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n"
rm -f error.log # Cleanup the temp log-file.
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala) # Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 8 exit 9
fi fi
fi fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }
STATS_DB=$1 STATS_DB=$1

View File

@ -65,4 +65,5 @@ DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;