Improvements to copying data from ocean to impala #420
|
@ -8,9 +8,13 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
|
||||||
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
|
||||||
IMPALA_HDFS_NODE=''
|
IMPALA_HDFS_NODE=''
|
||||||
COUNTER=0
|
COUNTER=0
|
||||||
|
|
||||||
while [ $COUNTER -lt 3 ]; do
|
while [ $COUNTER -lt 3 ]; do
|
||||||
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
||||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
||||||
|
@ -24,71 +28,195 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
fi
|
fi
|
||||||
((COUNTER++))
|
((COUNTER++))
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
|
# Set sed arguments.
|
||||||
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
|
# Set the SED command arguments for column-names with reserved words ("date", "hash", "location").
# Each reserved word gets three expressions, so that only the stand-alone column-name is back-quoted:
#   *_1: the word surrounded by whitespace (the surrounding whitespace is kept as it was),
#   *_2: the word after a "." and right before a ",",
#   *_3: the word after a "." and right before whitespace (that whitespace becomes a single space).
# The word may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
# No backslash is put in front of plain letters ("\d", "\h", "\l"), as the meaning of such escapes is undefined in POSIX sed.
DATE_SED_ARG_1='s/\([[:space:]]\)date\([[:space:]]\)/\1`date`\2/g'
DATE_SED_ARG_2='s/\.date,/.`date`,/g'
DATE_SED_ARG_3='s/\.date[[:space:]]/.`date` /g'

HASH_SED_ARG_1='s/\([[:space:]]\)hash\([[:space:]]\)/\1`hash`\2/g'
HASH_SED_ARG_2='s/\.hash,/.`hash`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/.`hash` /g'

LOCATION_SED_ARG_1='s/\([[:space:]]\)location\([[:space:]]\)/\1`location`\2/g'
LOCATION_SED_ARG_2='s/\.location,/.`location`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/.`location` /g'
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
|
|
||||||
|
|
||||||
db=$1
|
db=$1
|
||||||
FILE=("hive_wf_tmp_"$RANDOM)
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
|
||||||
|
|
||||||
# change ownership to impala
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make Impala aware of the deletion of the old DB immediately.
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
|
||||||
# copy the databases from ocean to impala
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
echo "copying $db"
|
# Using max-bandwidth of: 70 maps * 150 MB/s per map = 10500 MB/s (~10.5 GB/s)
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
# Using max memory of: 70 maps * 6144 MB = 430080 MB (~420 GB)
|
||||||
|
# Using 1MB as a buffer-size.
|
||||||
|
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||||
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
|
||||||
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
|
-numListstatusThreads 40 \
|
||||||
|
-copybuffersize 1048576 \
|
||||||
|
-strategy dynamic \
|
||||||
|
-pb \
|
||||||
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
# Check the exit status of the "hadoop distcp" command.
|
||||||
|
# Save the exit status of "hadoop distcp" immediately: every later command (including the
# "[ ... ]" test below) overwrites "$?", so reading "$?" inside the "else" branch would report
# the status of the test (always 1), instead of the real status of the copy.
distcp_exit_status=$?
if [ "$distcp_exit_status" -eq 0 ]; then
  echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
  echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
  rm -f error.log
  return 2
fi
|
||||||
|
|
||||||
# drop tables from db
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
||||||
do
|
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
|
|
||||||
done
|
|
||||||
|
|
||||||
# drop views from db
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
|
|
||||||
done
|
|
||||||
|
|
||||||
# delete the database
|
# create the new database (with the same name)
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# create the databases
|
# Make Impala aware of the creation of the new DB immediately.
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
|
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||||
|
|
||||||
impala-shell -q "INVALIDATE METADATA"
|
all_create_view_statements=()
|
||||||
echo "creating schema for ${db}"
|
|
||||||
for (( k = 0; k < 5; k ++ )); do
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in `impala-shell -d ${db} --delimited -q "show tables"`;
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
do
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
impala-shell -d ${db} --delimited -q "show create table $i";
|
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
||||||
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
||||||
|
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
||||||
|
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
||||||
|
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
||||||
|
all_create_view_statements+=("$create_view_statement")
|
||||||
|
else
|
||||||
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
else
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# load the data from /tmp in the respective tables
|
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||||
echo "copying data in tables and computing stats"
|
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
|
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
|
|
||||||
done
|
|
||||||
|
|
||||||
# deleting the remaining directory from hdfs
|
# Time to loop through the views and create them.
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
|
# At this point all table-schemas should have been created.
|
||||||
|
|
||||||
|
# Create the views on Impala, level by level.
# A view may depend on another view which has not been created yet; such a view fails with one of
# the "specific errors" matched below and is retried in the next level, after the views of the
# current level have been created. If a whole level passes without reducing the number of pending
# views, the loop is aborted with status 3, instead of retrying forever.
# Reads:  all_create_view_statements (array), db, HADOOP_USER_NAME, IMPALA_HOSTNAME
# Writes: error.log (output of the last impala-shell call), level_counter

# Use "${#array[@]}" to count the elements; "${#array}" would give the string-length of the first element.
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
  echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
  # Make Impala aware of the new tables, so it knows them when creating the views.
  sleep 1
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
  sleep 1
else
  echo -e "\nDB '${db}' does not contain any views.\n"
fi

level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
  ((level_counter++))
  # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
  # In this case, we should retry creating this particular view again.
  should_retry_create_view_statements=()

  for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
    # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log
    specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
    if [ -n "$specific_errors" ]; then
      echo -e "\nspecific_errors: ${specific_errors}\n"
      echo -e "\nView '$(grep "CREATE VIEW " error.log | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
      should_retry_create_view_statements+=("$create_view_statement")
    else
      sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
    fi
  done

  new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
  if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
    echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
    rm -f error.log # Clean up, like the other error-exits do.
    return 3
  elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
    echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
    previous_num_of_views_to_retry=$new_num_of_views_to_retry
  else
    echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
  fi

  # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
  all_create_view_statements=("${should_retry_create_view_statements[@]}")
done
|
||||||
|
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
|
||||||
|
echo -e "\nComputing stats for tables..\n"
|
||||||
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
|
else
|
||||||
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f error.log
|
||||||
|
echo -e "\n\nFinished processing db: ${db}\n\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
MONITOR_DB=$1
|
MONITOR_DB=$1
|
||||||
#HADOOP_USER_NAME=$2
|
#HADOOP_USER_NAME=$2
|
||||||
copydb $MONITOR_DB
|
copydb $MONITOR_DB
|
||||||
|
|
|
@ -8,9 +8,12 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
|
||||||
IMPALA_HDFS_NODE=''
|
IMPALA_HDFS_NODE=''
|
||||||
COUNTER=0
|
COUNTER=0
|
||||||
|
|
||||||
while [ $COUNTER -lt 3 ]; do
|
while [ $COUNTER -lt 3 ]; do
|
||||||
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
||||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
||||||
|
@ -24,70 +27,195 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
fi
|
fi
|
||||||
((COUNTER++))
|
((COUNTER++))
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
|
# Set sed arguments.
|
||||||
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
|
# Set the SED command arguments for column-names with reserved words ("date", "hash", "location").
# Each reserved word gets three expressions, so that only the stand-alone column-name is back-quoted:
#   *_1: the word surrounded by whitespace (the surrounding whitespace is kept as it was),
#   *_2: the word after a "." and right before a ",",
#   *_3: the word after a "." and right before whitespace (that whitespace becomes a single space).
# The word may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
# No backslash is put in front of plain letters ("\d", "\h", "\l"), as the meaning of such escapes is undefined in POSIX sed.
DATE_SED_ARG_1='s/\([[:space:]]\)date\([[:space:]]\)/\1`date`\2/g'
DATE_SED_ARG_2='s/\.date,/.`date`,/g'
DATE_SED_ARG_3='s/\.date[[:space:]]/.`date` /g'

HASH_SED_ARG_1='s/\([[:space:]]\)hash\([[:space:]]\)/\1`hash`\2/g'
HASH_SED_ARG_2='s/\.hash,/.`hash`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/.`hash` /g'

LOCATION_SED_ARG_1='s/\([[:space:]]\)location\([[:space:]]\)/\1`location`\2/g'
LOCATION_SED_ARG_2='s/\.location,/.`location`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/.`location` /g'
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
|
|
||||||
db=$1
|
db=$1
|
||||||
FILE=("hive_wf_tmp_"$RANDOM)
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
|
||||||
|
|
||||||
# change ownership to impala
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make Impala aware of the deletion of the old DB immediately.
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
|
||||||
# copy the databases from ocean to impala
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
echo "copying $db"
|
# Using max-bandwidth of: 70 maps * 150 MB/s per map = 10500 MB/s (~10.5 GB/s)
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
# Using max memory of: 70 maps * 6144 MB = 430080 MB (~420 GB)
|
||||||
|
# Using 1MB as a buffer-size.
|
||||||
|
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||||
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
|
||||||
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
|
-numListstatusThreads 40 \
|
||||||
|
-copybuffersize 1048576 \
|
||||||
|
-strategy dynamic \
|
||||||
|
-pb \
|
||||||
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
# Check the exit status of the "hadoop distcp" command.
|
||||||
|
# Save the exit status of "hadoop distcp" immediately: every later command (including the
# "[ ... ]" test below) overwrites "$?", so reading "$?" inside the "else" branch would report
# the status of the test (always 1), instead of the real status of the copy.
distcp_exit_status=$?
if [ "$distcp_exit_status" -eq 0 ]; then
  echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
  echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
  rm -f error.log
  return 2
fi
|
||||||
|
|
||||||
# drop tables from db
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
||||||
do
|
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
|
|
||||||
done
|
|
||||||
|
|
||||||
# drop views from db
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
|
|
||||||
done
|
|
||||||
|
|
||||||
# delete the database
|
# create the new database (with the same name)
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# create the databases
|
# Make Impala aware of the creation of the new DB immediately.
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
|
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||||
|
|
||||||
impala-shell -q "INVALIDATE METADATA"
|
all_create_view_statements=()
|
||||||
echo "creating schema for ${db}"
|
|
||||||
for (( k = 0; k < 5; k ++ )); do
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in `impala-shell -d ${db} --delimited -q "show tables"`;
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
do
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
impala-shell -d ${db} --delimited -q "show create table $i";
|
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
||||||
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
||||||
|
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
||||||
|
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
||||||
|
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
||||||
|
all_create_view_statements+=("$create_view_statement")
|
||||||
|
else
|
||||||
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
else
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# load the data from /tmp in the respective tables
|
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||||
echo "copying data in tables and computing stats"
|
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
|
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
|
|
||||||
done
|
|
||||||
|
|
||||||
# deleting the remaining directory from hdfs
|
# Time to loop through the views and create them.
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
|
# At this point all table-schemas should have been created.
|
||||||
|
|
||||||
|
# Create the views on Impala, level by level.
# A view may depend on another view which has not been created yet; such a view fails with one of
# the "specific errors" matched below and is retried in the next level, after the views of the
# current level have been created. If a whole level passes without reducing the number of pending
# views, the loop is aborted with status 3, instead of retrying forever.
# Reads:  all_create_view_statements (array), db, HADOOP_USER_NAME, IMPALA_HOSTNAME
# Writes: error.log (output of the last impala-shell call), level_counter

# Use "${#array[@]}" to count the elements; "${#array}" would give the string-length of the first element.
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
  echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
  # Make Impala aware of the new tables, so it knows them when creating the views.
  sleep 1
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
  sleep 1
else
  echo -e "\nDB '${db}' does not contain any views.\n"
fi

level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
  ((level_counter++))
  # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
  # In this case, we should retry creating this particular view again.
  should_retry_create_view_statements=()

  for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
    # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log
    specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
    if [ -n "$specific_errors" ]; then
      echo -e "\nspecific_errors: ${specific_errors}\n"
      echo -e "\nView '$(grep "CREATE VIEW " error.log | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
      should_retry_create_view_statements+=("$create_view_statement")
    else
      sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
    fi
  done

  new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
  if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
    echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
    rm -f error.log # Clean up, like the other error-exits do.
    return 3
  elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
    echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
    previous_num_of_views_to_retry=$new_num_of_views_to_retry
  else
    echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
  fi

  # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
  all_create_view_statements=("${should_retry_create_view_statements[@]}")
done
|
||||||
|
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
|
||||||
|
echo -e "\nComputing stats for tables..\n"
|
||||||
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
|
else
|
||||||
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f error.log
|
||||||
|
echo -e "\n\nFinished processing db: ${db}\n\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
MONITOR_DB=$1
|
MONITOR_DB=$1
|
||||||
#HADOOP_USER_NAME=$2
|
#HADOOP_USER_NAME=$2
|
||||||
copydb $MONITOR_DB
|
copydb $MONITOR_DB
|
||||||
|
|
|
@ -6,11 +6,14 @@ then
|
||||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
|
||||||
IMPALA_HDFS_NODE=''
|
IMPALA_HDFS_NODE=''
|
||||||
COUNTER=0
|
COUNTER=0
|
||||||
|
|
||||||
while [ $COUNTER -lt 3 ]; do
|
while [ $COUNTER -lt 3 ]; do
|
||||||
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
||||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
||||||
|
@ -24,75 +27,196 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
fi
|
fi
|
||||||
((COUNTER++))
|
((COUNTER++))
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
|
# Set sed arguments.
|
||||||
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
|
# Set the SED command arguments for column-names with reserved words:
|
||||||
|
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
||||||
|
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
||||||
|
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
||||||
|
|
||||||
|
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
||||||
|
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
||||||
|
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
||||||
|
|
||||||
|
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
||||||
|
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
||||||
|
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
|
|
||||||
export HADOOP_USER="dimitris.pierrakos"
|
|
||||||
export HADOOP_USER_NAME='dimitris.pierrakos'
|
|
||||||
|
|
||||||
db=$1
|
db=$1
|
||||||
FILE=("hive_wf_tmp_"$RANDOM)
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/φ
|
|
||||||
|
|
||||||
# change ownership to impala
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make Impala aware of the deletion of the old DB immediately.
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
|
||||||
# copy the databases from ocean to impala
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
echo "copying $db"
|
# Using max-bandwidth of: 70 maps * 150 MB/s per map = 10500 MB/s (~10.5 GB/s)
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
# Using max memory of: 70 maps * 6144 MB = 430080 MB (~420 GB)
|
||||||
|
# Using 1MB as a buffer-size.
|
||||||
|
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||||
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
|
||||||
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
|
-numListstatusThreads 40 \
|
||||||
|
-copybuffersize 1048576 \
|
||||||
|
-strategy dynamic \
|
||||||
|
-pb \
|
||||||
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
# Check the exit status of the "hadoop distcp" command.
|
||||||
|
# Capture the exit status of "hadoop distcp" right away: this must be the first command after it.
# Inside the "else" branch, "$?" would hold the status of the "[ ... ]" test itself (always 1),
# not the status of distcp, so the error message would report a wrong value.
distcp_exit_status=$?
if [ "${distcp_exit_status}" -eq 0 ]; then
  echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
  echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
  rm -f error.log
  return 2
fi
|
||||||
|
|
||||||
# drop tables from db
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
||||||
do
|
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
|
|
||||||
done
|
|
||||||
|
|
||||||
# drop views from db
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
|
|
||||||
done
|
|
||||||
|
|
||||||
# delete the database
|
# create the new database (with the same name)
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# create the databases
|
# Make Impala aware of the creation of the new DB immediately.
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
|
# So, we have to find at least one parquet file of the table (checking that it exists) among the files already copied to the Impala cluster, so that Impala can extract the table-schema itself from that file.
|
||||||
|
|
||||||
impala-shell -q "INVALIDATE METADATA"
|
all_create_view_statements=()
|
||||||
echo "creating schema for ${db}"
|
|
||||||
for (( k = 0; k < 5; k ++ )); do
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in `impala-shell -d ${db} --delimited -q "show tables"`;
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
do
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
impala-shell -d ${db} --delimited -q "show create table $i";
|
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
||||||
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
||||||
|
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
||||||
|
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
||||||
|
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
||||||
|
all_create_view_statements+=("$create_view_statement")
|
||||||
|
else
|
||||||
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
else
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# load the data from /tmp in the respective tables
|
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||||
echo "copying data in tables and computing stats"
|
|
||||||
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
|
|
||||||
impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
|
|
||||||
done
|
|
||||||
|
|
||||||
# deleting the remaining directory from hdfs
|
# Time to loop through the views and create them.
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
|
# At this point all table-schemas should have been created.
|
||||||
|
|
||||||
|
# Create the views of the DB on Impala, level by level.
# A view may depend on another view which has not been created yet; such statements are collected
# and retried in the next pass ("level"), until none remain or a whole pass makes no progress.
# Reads:  all_create_view_statements (array), db, HADOOP_USER_NAME, IMPALA_HOSTNAME.
# Writes: error.log (captured impala-shell output of the last statement).
# Returns 3 from the enclosing function when a pass does not reduce the number of pending views.

# Count the ELEMENTS of the array ("${#arr}" without "[@]" is only the string-length of the first element).
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
  echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
  # Make Impala aware of the new tables, so it knows them when creating the views.
  sleep 1
  impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "INVALIDATE METADATA"
  sleep 1
else
  echo -e "\nDB '${db}' does not contain any views.\n"
fi

level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
  ((level_counter++))
  # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
  # In this case, we should retry creating this particular view again.
  should_retry_create_view_statements=()

  for create_view_statement in "${all_create_view_statements[@]}"; do # Double quotes are needed, as the elements are whole statements, not single words.
    # impala-shell prints all logs in stderr, so we need to capture them in a file, in order to "grep" them afterwards.
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "${create_view_statement}" |& tee error.log
    specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
    if [ -n "$specific_errors" ]; then
      echo -e "\nspecific_errors: ${specific_errors}\n"
      echo -e "\nView '$(grep "CREATE VIEW " error.log | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
      should_retry_create_view_statements+=("$create_view_statement")
    else
      sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
    fi
  done

  new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
  if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
    # No view was created during this pass, so another pass would fail in exactly the same way.
    echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
    rm -f error.log # Clean up, like the other error-exits of this function do.
    return 3
  elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
    echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
    previous_num_of_views_to_retry=$new_num_of_views_to_retry
  else
    echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
  fi

  # Continue with the views which have to be retried (an empty list ends the loop).
  # The array name must match the one filled above; a misspelled name expands to nothing and silently drops the retries.
  all_create_view_statements=("${should_retry_create_view_statements[@]}")
done
|
||||||
|
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
|
||||||
|
echo -e "\nComputing stats for tables..\n"
|
||||||
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
|
else
|
||||||
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f error.log
|
||||||
|
echo -e "\n\nFinished processing db: ${db}\n\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
MONITOR_DB=$1
|
MONITOR_DB=$1
|
||||||
#HADOOP_USER_NAME=$2
|
|
||||||
|
|
||||||
copydb $MONITOR_DB'_institutions'
|
copydb $MONITOR_DB'_institutions'
|
||||||
copydb $MONITOR_DB
|
copydb $MONITOR_DB
|
||||||
|
|
|
@ -6,9 +6,13 @@ then
|
||||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
|
||||||
IMPALA_HDFS_NODE=''
|
IMPALA_HDFS_NODE=''
|
||||||
COUNTER=0
|
COUNTER=0
|
||||||
|
|
||||||
while [ $COUNTER -lt 3 ]; do
|
while [ $COUNTER -lt 3 ]; do
|
||||||
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
||||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
||||||
|
@ -22,76 +26,195 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
fi
|
fi
|
||||||
((COUNTER++))
|
((COUNTER++))
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||||
|
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
# Set sed arguments.
|
||||||
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
|
# Set the SED command arguments for column-names with reserved words:
|
||||||
|
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
||||||
|
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
||||||
|
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
||||||
|
|
||||||
|
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
||||||
|
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
||||||
|
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
||||||
|
|
||||||
|
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
||||||
|
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
||||||
|
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
||||||
|
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$6
|
export HADOOP_USER_NAME=$6
|
||||||
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
FILE=("hive_wf_tmp_"$RANDOM)
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
|
||||||
# copy the databases from ocean to impala
|
|
||||||
|
|
||||||
echo "copying $db"
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
# change ownership to impala
|
# Make Impala aware of the deletion of the old DB immediately.
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
|
||||||
# drop tables from db
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
# Using max-bandwidth of: 70 maps * 150 MB/s per map = 10500 MB/s (~10.5 GB/s)
|
||||||
do
|
# Using max memory of: 70 maps * 6144 MB = 430080 MB (~420 GB)
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
|
# Using 1MB as a buffer-size.
|
||||||
done
|
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||||
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
|
||||||
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
|
-numListstatusThreads 40 \
|
||||||
|
-copybuffersize 1048576 \
|
||||||
|
-strategy dynamic \
|
||||||
|
-pb \
|
||||||
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# drop views from db
|
# Check the exit status of the "hadoop distcp" command.
|
||||||
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
if [ $? -eq 0 ]; then
|
||||||
do
|
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
|
else
|
||||||
done
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
|
|
||||||
# delete the database
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
||||||
|
|
||||||
# create the databases
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
|
||||||
|
|
||||||
impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
|
# create the new database (with the same name)
|
||||||
echo "creating schema for ${db}"
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
for (( k = 0; k < 5; k ++ )); do
|
|
||||||
for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
|
# Make Impala aware of the creation of the new DB immediately.
|
||||||
do
|
sleep 1
|
||||||
impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
sleep 1
|
||||||
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
|
# So, we have to find at least one parquet file of the table (checking that it exists) among the files already copied to the Impala cluster, so that Impala can extract the table-schema itself from that file.
|
||||||
|
|
||||||
|
all_create_view_statements=()
|
||||||
|
|
||||||
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
|
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
|
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
||||||
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
||||||
|
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
||||||
|
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
||||||
|
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
||||||
|
all_create_view_statements+=("$create_view_statement")
|
||||||
|
else
|
||||||
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
else
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
if [ -n "$log_errors" ]; then
|
||||||
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
|
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||||
# do
|
|
||||||
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
|
|
||||||
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
|
||||||
#
|
|
||||||
# # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
|
|
||||||
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
|
|
||||||
# do
|
|
||||||
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
|
|
||||||
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
|
||||||
|
|
||||||
# load the data from /tmp in the respective tables
|
# Time to loop through the views and create them.
|
||||||
echo "copying data in tables and computing stats"
|
# At this point all table-schemas should have been created.
|
||||||
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
|
||||||
do
|
|
||||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
|
|
||||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
|
|
||||||
done
|
|
||||||
|
|
||||||
# deleting the remaining directory from hdfs
|
previous_num_of_views_to_retry=${#all_create_view_statements}
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
|
# Create the views of the DB on Impala, level by level.
# A view may depend on another view which has not been created yet; such statements are collected
# and retried in the next pass ("level"), until none remain or a whole pass makes no progress.
# Reads:  all_create_view_statements (array), db, HADOOP_USER_NAME, IMPALA_HOSTNAME.
# Writes: error.log (captured impala-shell output of the last statement).
# Returns 3 from the enclosing function when a pass does not reduce the number of pending views.

# (Re)count the ELEMENTS of the array ("${#arr}" without "[@]" is only the string-length of the first element).
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
  echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
  # Make Impala aware of the new tables, so it knows them when creating the views.
  sleep 1
  impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "INVALIDATE METADATA"
  sleep 1
else
  echo -e "\nDB '${db}' does not contain any views.\n"
fi

level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
  ((level_counter++))
  # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
  # In this case, we should retry creating this particular view again.
  should_retry_create_view_statements=()

  for create_view_statement in "${all_create_view_statements[@]}"; do # Double quotes are needed, as the elements are whole statements, not single words.
    # impala-shell prints all logs in stderr, so we need to capture them in a file, in order to "grep" them afterwards.
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "${create_view_statement}" |& tee error.log
    specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
    if [ -n "$specific_errors" ]; then
      echo -e "\nspecific_errors: ${specific_errors}\n"
      echo -e "\nView '$(grep "CREATE VIEW " error.log | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
      should_retry_create_view_statements+=("$create_view_statement")
    else
      sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
    fi
  done

  new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
  if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
    # No view was created during this pass, so another pass would fail in exactly the same way.
    echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
    rm -f error.log # Clean up, like the other error-exits of this function do.
    return 3
  elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
    echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
    previous_num_of_views_to_retry=$new_num_of_views_to_retry
  else
    echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
  fi

  # Continue with the views which have to be retried (an empty list ends the loop).
  # The array name must match the one filled above; a misspelled name expands to nothing and silently drops the retries.
  all_create_view_statements=("${should_retry_create_view_statements[@]}")
done
|
||||||
|
|
||||||
|
sleep 1
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||||
|
sleep 1
|
||||||
|
|
||||||
|
echo -e "\nComputing stats for tables..\n"
|
||||||
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
|
else
|
||||||
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f error.log
|
||||||
|
echo -e "\n\nFinished processing db: ${db}\n\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
STATS_DB=$1
|
STATS_DB=$1
|
||||||
|
|
Loading…
Reference in New Issue