# sed expression that rewrites the Ocean HDFS node to the Impala HDFS node.
# It uses "|" as the s-command delimiter (as the "/" conflicts with the URIs), so it has to be passed to "sed -e".
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g"
# Set the SED command arguments for column-names with reserved words:
# Back-quote a column that is referenced as ".date,".
# The "date" may be part of a larger field name like "datestamp" or "date_aggregated", so the pattern
# requires the leading "." and the trailing "," in order to be careful with what we are replacing.
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'
# Delete the old DB from the Impala cluster (if it exists).
# impala-shell prints all logs in stderr, so "|&" pipes both streams through tee, which keeps a copy of them
# in error.log, in order to perform "grep" on them later.
# The user and host expansions are quoted so that they are never word-split or glob-expanded.
impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "drop database if exists ${db} cascade" |& tee error.log
# Progress message printed before the files of ${db} are copied.
echo "Copying $db files from Ocean to Impala cluster.."
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
# Start with an empty array of "create view" statements; a later loop in this script passes each element to impala-shell.
all_create_view_commands=()
for i in `hive -e "show tables in ${db};"| sed 's/WARN:.*//g'`;do# Get the tables and views without any potential the "WARN" logs.
# Check if this is a view by showing the create-command, which should print "create view" for a view, not "create table". Unfortunately, there is no "show views" command.
# Capture the full "show create table" output of the current entity.
# It needs to happen in two stages (capture first, filter afterwards), otherwise the "grep" is not able to match the multi-line command.
create_entity_command=$(hive -e "show create table ${db}.${i};")
# Announce that the current entity is a view, whose "create view" command is saved and executed on Impala after all tables have been created.
echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n"
# Transform the captured hive output into the statement stored in create_view_command. The pipeline, in order:
#   - blanks out "WARN:" and the rest of that line, and removes every back-quote,
#   - replaces a double-quote at the end of a line with ";" and removes a double-quote at the start of a line,
#   - collapses doubled double-quotes into a single one
#     (NOTE(review): after back-tick unescaping the sed script should read s/\"\"/\"/g; confirm),
#   - applies LOCATION_HDFS_NODE_SED_ARG, then the DATE / HASH / LOCATION sed expressions, variants 1 to 3
#     (presumably the reserved-word rewrites; most of their definitions are not visible in this part of the script).
create_view_command=`echo -e "$create_entity_command"| sed 's/WARN:.*//g'| sed 's/\`//g'\
| sed 's/"$/;/'| sed 's/^"//'| sed 's/\\"\\"/\"/g'| sed -e "${LOCATION_HDFS_NODE_SED_ARG}"| sed "${DATE_SED_ARG_1}"| sed "${HASH_SED_ARG_1}"| sed "${LOCATION_SED_ARG_1}"\
| sed "${DATE_SED_ARG_2}"| sed "${HASH_SED_ARG_2}"| sed "${LOCATION_SED_ARG_2}"\
| sed "${DATE_SED_ARG_3}"| sed "${HASH_SED_ARG_3}"| sed "${LOCATION_SED_ARG_3}"`
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again.
# Retry state. We should NOT do another iteration, unless at least one view could NOT be created.
should_retry=0
# Starts out empty; presumably collects the "create view" statements that have to be retried (TODO confirm against the code that appends to it).
should_retry_create_view_commands=()
for create_view_command in "${all_create_view_commands[@]}";do# Get the tables and views without any potential the "WARN" logs.
# Run the statement on Impala. impala-shell prints all logs in stderr, so "|&" pipes both streams through tee,
# which keeps a copy of them in error.log, in order to perform "grep" on them right below.
# The user and host expansions are quoted so that they are never word-split or glob-expanded.
impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "${create_view_command}" |& tee error.log
# Keep only the log lines that match one of the two error signatures this script looks for.
specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
echo -e "\nspecific_errors: ${specific_errors}\n"
if[ -n "$specific_errors"];then
# Report which view failed: take the "CREATE VIEW " line(s) from error.log, then strip that prefix and everything from " as select " onwards.
echo -e "\nView '$(grep "CREATE VIEW " error.log | sed -e 's/CREATE VIEW //g' -e 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
# Iterate over what impala-shell (run with "--delimited") prints for "show tables in ${db}".
# The command substitution is left unquoted on purpose, so that its output is word-split into one name per iteration;
# the user and host expansions inside it are quoted so that they are never word-split or glob-expanded.
for i in $(impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" --delimited -q "show tables in ${db}"); do
# Taking the create table statement from the Ocean cluster (hive), just to check if it's a view, as the output is easier to use
# than the one impala-shell gives from the Impala cluster.
# A plain "grep" works here, as we do not need to match across multiple lines.
create_view_command=$(hive -e "show create table ${db}.${i};" | grep "CREATE VIEW")
# No "CREATE VIEW" line was found, so it's a table: go load the data to it.
if [[ -z "${create_view_command}" ]]; then