Small updates to the copy-operation to Impala Cluster:

- Add a configuration-"switch" to control whether the script exits upon an error or not.
- Allow the script to exit when a table could not be created.
- Show the elapsed time for processing each database.
This commit is contained in:
Lampros Smyrnaios 2024-05-23 15:07:49 +03:00
parent c7b32bbacc
commit 68322843e2
4 changed files with 153 additions and 29 deletions

View File

@ -8,6 +8,7 @@ fi
export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
@ -30,8 +31,10 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1
fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
@ -43,8 +46,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
function print_elapsed_time()
{
start_time=$1
end_time=$(date +%s)
elapsed_time=$(($end_time-$start_time))
hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() {
db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@ -53,8 +69,10 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2
fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -77,8 +95,10 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3
fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -109,12 +129,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi
fi
fi
@ -158,7 +183,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@ -186,11 +213,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
exit 6
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
}

View File

@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@ -29,8 +32,10 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1
fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
@ -42,8 +47,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
function print_elapsed_time()
{
start_time=$1
end_time=$(date +%s)
elapsed_time=$(($end_time-$start_time))
hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() {
db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@ -52,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2
fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -76,8 +96,10 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3
fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -108,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi
fi
fi
@ -157,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@ -185,11 +214,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
exit 6
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
}

View File

@ -8,6 +8,8 @@ fi
export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@ -29,8 +31,10 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1
fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
@ -42,8 +46,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
function print_elapsed_time()
{
start_time=$1
end_time=$(date +%s)
elapsed_time=$(($end_time-$start_time))
hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() {
db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@ -52,8 +69,10 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2
fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -76,8 +95,10 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3
fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -108,12 +129,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi
fi
fi
@ -157,7 +183,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@ -185,11 +213,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
exit 6
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
}

View File

@ -6,6 +6,8 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
@ -28,8 +30,10 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1
fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
@ -45,8 +49,21 @@ export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
function print_elapsed_time()
{
start_time=$1
end_time=$(date +%s)
elapsed_time=$(($end_time-$start_time))
hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() {
db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@ -55,8 +72,10 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2
fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -79,8 +98,10 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3
fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -111,12 +132,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi
fi
fi
@ -160,7 +186,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@ -188,11 +216,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
exit 6
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
}
STATS_DB=$1