Merge pull request 'changes in copy script - beta2master' (#439) from antonis.lempesis/dnet-hadoop:beta into beta_to_master_may2024
Reviewed-on: #439
This commit is contained in:
commit
b703f94f09
|
@ -8,6 +8,8 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
@ -30,8 +32,10 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
function print_elapsed_time()
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
{
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
elapsed_time=$(($end_time-$start_time))
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
hours=$((elapsed_time / 3600))
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -67,7 +70,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -85,17 +92,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -109,17 +129,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -127,12 +143,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -176,7 +197,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -204,11 +227,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,9 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
@ -29,8 +32,10 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
function print_elapsed_time()
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
{
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
elapsed_time=$(($end_time-$start_time))
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
hours=$((elapsed_time / 3600))
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -66,7 +70,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -84,17 +92,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -108,17 +129,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -126,12 +143,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -175,7 +197,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -203,11 +227,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,9 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
@ -29,8 +32,10 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
function print_elapsed_time()
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
{
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
elapsed_time=$(($end_time-$start_time))
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
hours=$((elapsed_time / 3600))
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -66,7 +70,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -84,17 +92,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -108,17 +129,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -126,12 +143,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -175,7 +197,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -203,11 +227,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,8 @@ then
|
||||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
@ -28,8 +30,10 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
|
||||||
|
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
|
||||||
|
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$6
|
export HADOOP_USER_NAME=$6
|
||||||
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
||||||
|
|
||||||
|
|
||||||
|
# Prints the wall-clock time elapsed since a given start moment, as "HH:MM:SS".
# Arguments:
#   $1 - the start time in epoch seconds (as produced by `date +%s`), OR the name
#        of a variable holding that value (e.g. `print_elapsed_time start_db_time`).
# Outputs:
#   Writes "Elapsed time: HH:MM:SS" to stdout, surrounded by blank lines.
# Returns:
#   0 on success, 1 when no start time was given.
function print_elapsed_time()
{
  # All working variables are local, so that calling this function does not overwrite
  # same-named variables of the caller. The prefix keeps them from shadowing a
  # variable whose NAME is passed in as the argument.
  local _pet_start_ref=$1
  local _pet_start _pet_now _pet_elapsed _pet_hours _pet_minutes _pet_seconds

  if [[ -z "$_pet_start_ref" ]]; then
    echo "print_elapsed_time: missing start-time argument (epoch seconds or variable name)" >&2
    return 1
  fi

  # Arithmetic evaluation resolves both forms of the argument: a literal number is
  # used as-is, while a variable name is evaluated to that variable's value.
  _pet_start=$((_pet_start_ref))
  _pet_now=$(date +%s)
  _pet_elapsed=$((_pet_now - _pet_start))

  _pet_hours=$((_pet_elapsed / 3600))
  _pet_minutes=$(((_pet_elapsed % 3600) / 60))
  _pet_seconds=$((_pet_elapsed % 60))

  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$_pet_hours" "$_pet_minutes" "$_pet_seconds"
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -68,7 +72,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -86,17 +94,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -110,17 +131,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -128,12 +145,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -177,7 +199,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -205,11 +229,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
STATS_DB=$1
|
STATS_DB=$1
|
||||||
|
|
|
@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
|
||||||
with
|
with
|
||||||
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
|
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
|
||||||
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
|
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
|
||||||
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
|
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
|
||||||
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
|
lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
|
||||||
|
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
|
||||||
from lvl1
|
from lvl1
|
||||||
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
|
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
|
||||||
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
|
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
|
||||||
|
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
|
||||||
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
|
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue