# Must be used with "sed -e" so that "|" can serve as the delimiter (the usual "/" conflicts with the URIs).
# NOTE: the space before "#" is required; without it the "#" glues onto the value and the rest of the
# line is executed as a command, leaving the variable unset.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g"

# "date" may be part of a larger field name such as "datestamp" or "date_aggregated",
# so match the exact ".date," token only.
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'
# Delete the old DB from Impala cluster (if exists).
# Delete the old DB from the Impala cluster (if it exists).
# impala-shell prints all logs on stderr, so capture them (via |&) into error.log to "grep" them later.
impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "drop database if exists ${db} cascade" |& tee error.log
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
# Capture the full create-statement first; matching must happen in two stages, otherwise
# "grep" cannot match the multi-line statement. (Space before "#" is required, or the "#"
# glues onto the value and the remainder of the line is executed as a command.)
create_entity_statement=$(hive -e "show create table ${db}.${i};")
| sed 's/"$/;/'| sed 's/^"//'| sed 's/\\"\\"/\"/g'| sed -e "${LOCATION_HDFS_NODE_SED_ARG}"| sed "${DATE_SED_ARG_1}"| sed "${HASH_SED_ARG_1}"| sed "${LOCATION_SED_ARG_1}"\
| sed "${DATE_SED_ARG_2}"| sed "${HASH_SED_ARG_2}"| sed "${LOCATION_SED_ARG_2}"\
| sed "${DATE_SED_ARG_3}"| sed "${HASH_SED_ARG_3}"| sed "${LOCATION_SED_ARG_3}"`
for create_view_statement in "${all_create_view_statements[@]}";do# Here we use double quotes, as the elements are phrases, instead of single-words.
# impala-shell prints all logs on stderr, so capture them (via |&) into error.log to "grep" them later.
impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "${create_view_statement}" |& tee error.log
# Look for the two known, retryable failure signatures in the captured impala-shell output.
# (grep reads error.log directly; piping through "cat" was redundant.)
specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
# Report which view failed; its name is extracted from the "Query: CREATE VIEW <name>" line in error.log.
# (grep reads error.log directly; piping through "cat" was redundant. NOTE(review): "\s" inside a POSIX
# ERE bracket expression is a GNU grep extension — confirm GNU grep is the target.)
echo -e "\nView '$(grep -Eo "Query: CREATE VIEW ([^\s]+)" error.log | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
# Increment the counter here, instead of taking the array's size at the end, as that does not work reliably.
# Plain assignment is used because "((x++))" exits with status 1 when x was 0, which would trip "set -e".
new_num_of_views_to_retry=$((new_num_of_views_to_retry + 1))
# Re-index the array, filtering out any empty or whitespace-only elements.
# The previous one-liner was broken three ways: "echo" was glued to its argument (ran a bogus
# command), the quoted "$( )" collapsed everything into a single element, and "[\s]" is not a
# whitespace class in grep. A plain element-filter loop keeps each statement intact and makes
# "${#all_create_view_statements[@]}" report the true count again.
__retained_statements=()
for __stmt in "${all_create_view_statements[@]}"; do
  [[ "${__stmt}" =~ ^[[:space:]]*$ ]] || __retained_statements+=("${__stmt}")
done
all_create_view_statements=("${__retained_statements[@]}")
unset __retained_statements __stmt
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not make the array report its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
# Taking the create table statement from the Ocean cluster, just to check if it's a view, as the output is easier to parse than the one from impala-shell on the Impala cluster.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)