forked from D-Net/dnet-hadoop
Merge pull request 'Various fixes in the stats wf' (#430) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: D-Net/dnet-hadoop#430
This commit is contained in:
commit
908ed9da7a
|
@ -67,24 +67,21 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 1
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Make Impala aware of the deletion of the old DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||||
# Using max memory of: 50 * 6144 = 300 Gb
|
# Using max memory of: 70 * 6144 = 430 Gb
|
||||||
# Using 1MB as a buffer-size.
|
# Using 1MB as a buffer-size.
|
||||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
-numListstatusThreads 40 \
|
-numListstatusThreads 40 \
|
||||||
-copybuffersize 1048576 \
|
-copybuffersize 1048576 \
|
||||||
-strategy dynamic \
|
-strategy dynamic \
|
||||||
|
-blocksperchunk 8 \
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
|
@ -92,9 +89,9 @@ function copydb() {
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 2
|
exit 3
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
|
@ -105,14 +102,11 @@ function copydb() {
|
||||||
# create the new database (with the same name)
|
# create the new database (with the same name)
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# Make Impala aware of the creation of the new DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||||
|
|
||||||
all_create_view_statements=()
|
all_create_view_statements=()
|
||||||
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
@ -129,9 +123,11 @@ function copydb() {
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
((num_tables++))
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
@ -142,74 +138,73 @@ function copydb() {
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||||
|
if [[ $num_tables -gt 0 ]]; then
|
||||||
|
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
else
|
||||||
|
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
fi
|
||||||
|
|
||||||
# Time to loop through the views and create them.
|
|
||||||
# At this point all table-schemas should have been created.
|
|
||||||
|
|
||||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
|
||||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
else
|
else
|
||||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
level_counter=0
|
level_counter=0
|
||||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||||
((level_counter++))
|
((level_counter++))
|
||||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||||
# In this case, we should retry creating this particular view again.
|
# In this case, we should retry creating this particular view again.
|
||||||
should_retry_create_view_statements=()
|
new_num_of_views_to_retry=0
|
||||||
|
|
||||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||||
if [ -n "$specific_errors" ]; then
|
if [ -n "$specific_errors" ]; then
|
||||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||||
should_retry_create_view_statements+=("$create_view_statement")
|
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||||
else
|
else
|
||||||
|
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||||
|
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
return 3
|
exit 5
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
|
||||||
else
|
else
|
||||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||||
fi
|
fi
|
||||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||||
done
|
done
|
||||||
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
|
|
||||||
echo -e "\nComputing stats for tables..\n"
|
echo -e "\nComputing stats for tables..\n"
|
||||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||||
|
sleep 1
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 4
|
exit 6
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
|
|
@ -66,24 +66,21 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 1
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Make Impala aware of the deletion of the old DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||||
# Using max memory of: 50 * 6144 = 300 Gb
|
# Using max memory of: 70 * 6144 = 430 Gb
|
||||||
# Using 1MB as a buffer-size.
|
# Using 1MB as a buffer-size.
|
||||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
-numListstatusThreads 40 \
|
-numListstatusThreads 40 \
|
||||||
-copybuffersize 1048576 \
|
-copybuffersize 1048576 \
|
||||||
-strategy dynamic \
|
-strategy dynamic \
|
||||||
|
-blocksperchunk 8 \
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
|
@ -91,9 +88,9 @@ function copydb() {
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 2
|
exit 3
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
|
@ -104,14 +101,11 @@ function copydb() {
|
||||||
# create the new database (with the same name)
|
# create the new database (with the same name)
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# Make Impala aware of the creation of the new DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||||
|
|
||||||
all_create_view_statements=()
|
all_create_view_statements=()
|
||||||
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
@ -128,9 +122,11 @@ function copydb() {
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
((num_tables++))
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
@ -141,74 +137,73 @@ function copydb() {
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||||
|
if [[ $num_tables -gt 0 ]]; then
|
||||||
|
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
else
|
||||||
|
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
fi
|
||||||
|
|
||||||
# Time to loop through the views and create them.
|
|
||||||
# At this point all table-schemas should have been created.
|
|
||||||
|
|
||||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
|
||||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
else
|
else
|
||||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
level_counter=0
|
level_counter=0
|
||||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||||
((level_counter++))
|
((level_counter++))
|
||||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||||
# In this case, we should retry creating this particular view again.
|
# In this case, we should retry creating this particular view again.
|
||||||
should_retry_create_view_statements=()
|
new_num_of_views_to_retry=0
|
||||||
|
|
||||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||||
if [ -n "$specific_errors" ]; then
|
if [ -n "$specific_errors" ]; then
|
||||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||||
should_retry_create_view_statements+=("$create_view_statement")
|
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||||
else
|
else
|
||||||
|
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||||
|
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
return 3
|
exit 5
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
|
||||||
else
|
else
|
||||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||||
fi
|
fi
|
||||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||||
done
|
done
|
||||||
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
|
|
||||||
echo -e "\nComputing stats for tables..\n"
|
echo -e "\nComputing stats for tables..\n"
|
||||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||||
|
sleep 1
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 4
|
exit 6
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
|
|
@ -66,24 +66,21 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 1
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Make Impala aware of the deletion of the old DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||||
# Using max memory of: 50 * 6144 = 300 Gb
|
# Using max memory of: 70 * 6144 = 430 Gb
|
||||||
# Using 1MB as a buffer-size.
|
# Using 1MB as a buffer-size.
|
||||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
-numListstatusThreads 40 \
|
-numListstatusThreads 40 \
|
||||||
-copybuffersize 1048576 \
|
-copybuffersize 1048576 \
|
||||||
-strategy dynamic \
|
-strategy dynamic \
|
||||||
|
-blocksperchunk 8 \
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
|
@ -91,9 +88,9 @@ function copydb() {
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 2
|
exit 3
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
|
@ -104,14 +101,11 @@ function copydb() {
|
||||||
# create the new database (with the same name)
|
# create the new database (with the same name)
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# Make Impala aware of the creation of the new DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||||
|
|
||||||
all_create_view_statements=()
|
all_create_view_statements=()
|
||||||
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
@ -128,9 +122,11 @@ function copydb() {
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
((num_tables++))
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
@ -141,74 +137,73 @@ function copydb() {
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||||
|
if [[ $num_tables -gt 0 ]]; then
|
||||||
|
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
else
|
||||||
|
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
fi
|
||||||
|
|
||||||
# Time to loop through the views and create them.
|
|
||||||
# At this point all table-schemas should have been created.
|
|
||||||
|
|
||||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
|
||||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
else
|
else
|
||||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
level_counter=0
|
level_counter=0
|
||||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||||
((level_counter++))
|
((level_counter++))
|
||||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||||
# In this case, we should retry creating this particular view again.
|
# In this case, we should retry creating this particular view again.
|
||||||
should_retry_create_view_statements=()
|
new_num_of_views_to_retry=0
|
||||||
|
|
||||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||||
if [ -n "$specific_errors" ]; then
|
if [ -n "$specific_errors" ]; then
|
||||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||||
should_retry_create_view_statements+=("$create_view_statement")
|
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||||
else
|
else
|
||||||
|
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||||
|
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
return 3
|
exit 5
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
|
||||||
else
|
else
|
||||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||||
fi
|
fi
|
||||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||||
done
|
done
|
||||||
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
|
|
||||||
echo -e "\nComputing stats for tables..\n"
|
echo -e "\nComputing stats for tables..\n"
|
||||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||||
|
sleep 1
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 4
|
exit 6
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
|
|
@ -68,24 +68,21 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 1
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Make Impala aware of the deletion of the old DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||||
# Using max memory of: 50 * 6144 = 300 Gb
|
# Using max memory of: 70 * 6144 = 430 Gb
|
||||||
# Using 1MB as a buffer-size.
|
# Using 1MB as a buffer-size.
|
||||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||||
-numListstatusThreads 40 \
|
-numListstatusThreads 40 \
|
||||||
-copybuffersize 1048576 \
|
-copybuffersize 1048576 \
|
||||||
-strategy dynamic \
|
-strategy dynamic \
|
||||||
|
-blocksperchunk 8 \
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
|
@ -93,9 +90,9 @@ function copydb() {
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 2
|
exit 3
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||||
|
@ -106,14 +103,11 @@ function copydb() {
|
||||||
# create the new database (with the same name)
|
# create the new database (with the same name)
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||||
|
|
||||||
# Make Impala aware of the creation of the new DB immediately.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||||
|
|
||||||
all_create_view_statements=()
|
all_create_view_statements=()
|
||||||
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
|
@ -130,9 +124,11 @@ function copydb() {
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
((num_tables++))
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
|
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
|
@ -143,74 +139,73 @@ function copydb() {
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||||
|
if [[ $num_tables -gt 0 ]]; then
|
||||||
|
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
else
|
||||||
|
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||||
|
fi
|
||||||
|
|
||||||
# Time to loop through the views and create them.
|
|
||||||
# At this point all table-schemas should have been created.
|
|
||||||
|
|
||||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
|
||||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
else
|
else
|
||||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
level_counter=0
|
level_counter=0
|
||||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||||
((level_counter++))
|
((level_counter++))
|
||||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||||
# In this case, we should retry creating this particular view again.
|
# In this case, we should retry creating this particular view again.
|
||||||
should_retry_create_view_statements=()
|
new_num_of_views_to_retry=0
|
||||||
|
|
||||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||||
if [ -n "$specific_errors" ]; then
|
if [ -n "$specific_errors" ]; then
|
||||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||||
should_retry_create_view_statements+=("$create_view_statement")
|
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||||
else
|
else
|
||||||
|
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||||
|
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
return 3
|
exit 5
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
|
||||||
else
|
else
|
||||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||||
fi
|
fi
|
||||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||||
done
|
done
|
||||||
|
|
||||||
sleep 1
|
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
|
||||||
sleep 1
|
|
||||||
|
|
||||||
echo -e "\nComputing stats for tables..\n"
|
echo -e "\nComputing stats for tables..\n"
|
||||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||||
|
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||||
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||||
|
sleep 1
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
return 4
|
exit 6
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
set mapred.job.queue.name=analytics;
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
-- Additional relations
|
-- Additional relations
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
set mapred.job.queue.name=analytics;
|
||||||
|
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
-- Additional relations
|
-- Additional relations
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
set mapred.job.queue.name=analytics;
|
||||||
|
|
||||||
-------------------------------------------
|
-------------------------------------------
|
||||||
--- Extra tables, mostly used by indicators
|
--- Extra tables, mostly used by indicators
|
||||||
|
|
||||||
|
|
|
@ -249,7 +249,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
|
||||||
left semi join dd on dd.id=pd.datasource
|
left semi join dd on dd.id=pd.datasource
|
||||||
union all
|
union all
|
||||||
select ra.id, 1 as is_gold
|
select ra.id, 1 as is_gold
|
||||||
from ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
|
from ${stats_db_name}.result_accessroute ra where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
|
||||||
create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as
|
create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||||
|
@ -294,7 +294,7 @@ left outer join (
|
||||||
join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id
|
join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id
|
||||||
left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id
|
left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id
|
||||||
where indi_gold.is_gold=0 and
|
where indi_gold.is_gold=0 and
|
||||||
((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on pd.i=tmp.id; /*EOS*/
|
((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on p.id=tmp.id; /*EOS*/
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/
|
||||||
create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as
|
create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as
|
||||||
|
@ -380,7 +380,7 @@ CREATE TEMPORARY VIEW allresults as
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub as
|
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
|
||||||
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||||
from allresults ar join result_fair rf
|
from allresults ar join result_fair rf
|
||||||
on rf.organization=ar.organization; /*EOS*/
|
on rf.organization=ar.organization; /*EOS*/
|
||||||
|
@ -639,7 +639,7 @@ from ${stats_db_name}.publication p
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_result_with_pid as
|
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
|
||||||
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||||
from ${stats_db_name}.result p
|
from ${stats_db_name}.result p
|
||||||
left outer join (
|
left outer join (
|
||||||
|
@ -653,7 +653,7 @@ group by rf.id; /*EOS*/
|
||||||
|
|
||||||
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
||||||
|
|
||||||
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity as
|
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
|
||||||
select distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
select distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||||
as is_interdisciplinary
|
as is_interdisciplinary
|
||||||
from pub_fos_totals p
|
from pub_fos_totals p
|
||||||
|
@ -1006,14 +1006,14 @@ left outer join (
|
||||||
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
|
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
|
||||||
|
|
||||||
create table ${stats_db_name}.result_country stored as parquet as
|
create table ${stats_db_name}.result_country stored as parquet as
|
||||||
select distinct *
|
select distinct id, country
|
||||||
from (
|
from (
|
||||||
select ro.id, o.country
|
select ro.id, o.country
|
||||||
from ${stats_db_name}.result_organization ro
|
from ${stats_db_name}.result_organization ro
|
||||||
left outer join ${stats_db_name}.organization o on o.id=ro.organization
|
left outer join ${stats_db_name}.organization o on o.id=ro.organization
|
||||||
union all
|
union all
|
||||||
select rp.id, f.country
|
select rp.id, f.country
|
||||||
from ${stats_db_name}.result_projects
|
from ${stats_db_name}.result_projects rp
|
||||||
left outer join ${stats_db_name}.project p on p.id=rp.project
|
left outer join ${stats_db_name}.project p on p.id=rp.project
|
||||||
left outer join ${stats_db_name}.funder f on f.name=p.funder
|
left outer join ${stats_db_name}.funder f on f.name=p.funder
|
||||||
) rc
|
) rc
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
set mapred.job.queue.name=analytics;
|
||||||
|
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
-- Shortcuts for various definitions in stats db ---
|
-- Shortcuts for various definitions in stats db ---
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
set mapred.job.queue.name=analytics;
|
||||||
|
|
||||||
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
|
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
|
||||||
-- peer reviewed)
|
-- peer reviewed)
|
||||||
drop table if exists ${stats_db_name}.result_tmp;
|
drop table if exists ${stats_db_name}.result_tmp;
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
set mapred.job.queue.name=analytics;
|
||||||
|
|
||||||
--------------------------------------------------------------
|
--------------------------------------------------------------
|
||||||
--------------------------------------------------------------
|
--------------------------------------------------------------
|
||||||
-- Publication table/view and Publication related tables/views
|
-- Publication table/view and Publication related tables/views
|
||||||
|
|
|
@ -368,6 +368,7 @@
|
||||||
${sparkClusterOpts}
|
${sparkClusterOpts}
|
||||||
${sparkResourceOpts}
|
${sparkResourceOpts}
|
||||||
${sparkApplicationOpts}
|
${sparkApplicationOpts}
|
||||||
|
--queue analytics
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
|
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
|
||||||
|
|
|
@ -30,6 +30,10 @@
|
||||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
<value>${oozieLauncherQueueName}</value>
|
<value>${oozieLauncherQueueName}</value>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>mapred.child.java.opts</name>
|
||||||
|
<value>-Xmx16g</value>
|
||||||
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue