From a8d68c9d294518734d9e6ac724ab40d9e4ff5d3d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 11 Jun 2024 12:40:50 +0200 Subject: [PATCH 1/7] avoid NPEs --- .../oaf/utils/MergeEntitiesComparator.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java index 5792fc10f..ff6c2689a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeEntitiesComparator.java @@ -1,13 +1,9 @@ package eu.dnetlib.dhp.schema.oaf.utils; -import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; - import java.util.*; -import java.util.stream.Collectors; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Result; @@ -42,17 +38,23 @@ public class MergeEntitiesComparator implements Comparator { int res = 0; // pid authority - int cfp1 = left - .getCollectedfrom() - .stream() - .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey())) - .max(Integer::compare) + int cfp1 = Optional + .ofNullable(left.getCollectedfrom()) + .map( + cf -> cf + .stream() + .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey())) + .max(Integer::compare) + .orElse(-1)) .orElse(-1); - int cfp2 = right - .getCollectedfrom() - .stream() - .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey())) - .max(Integer::compare) + int cfp2 = Optional + .ofNullable(right.getCollectedfrom()) + .map( + cf -> cf + .stream() + .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey())) + .max(Integer::compare) + .orElse(-1)) .orElse(-1); if (cfp1 >= 0 && cfp1 > cfp2) { From 11fe3a4fe0af870ee519de5dd546d2d28483c4a4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 11 Jun 2024 14:21:01 +0200 Subject: [PATCH 2/7] [graph resolution] use sparkExecutorMemory to define also the memoryOverhead --- .../eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml index 74e792f07..916a9f2b1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml @@ -45,6 +45,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.shuffle.partitions=15000 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -79,6 +80,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.shuffle.partitions=10000 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} From d90cb099b8deababf4428626fd9e7f164fbe2ae1 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: 
Tue, 11 Jun 2024 20:23:44 +0200 Subject: [PATCH 3/7] Fix for paginationStart parameter management --- .../eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 9037a454e..caef266d7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -249,7 +249,8 @@ public class RestIterator implements Iterator { String[] pageVal = m.group(0).split("="); pagination = Integer.parseInt(pageVal[1]); - // remove page start number from queryParams + // remove page start number from query and queryParams + queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", ""); query = query.replaceFirst("&?paginationStart=[0-9]+", ""); } From 9bf2bda1c6916ea45675a71e9c115c83182371bd Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 12 Jun 2024 13:28:51 +0200 Subject: [PATCH 4/7] Fix: next returned a null value at end of stream --- .../collection/plugin/rest/RestIterator.java | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index caef266d7..0895d5f43 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -176,19 +176,6 @@ public class RestIterator implements Iterator { */ @Override public boolean hasNext() { - if (this.recordQueue.isEmpty() && this.query.isEmpty()) { - disconnect(); - return false; - } - return true; - } - - /* - * (non-Javadoc) - * @see java.util.Iterator#next() - */ - @Override - public String next() { synchronized (this.recordQueue) { while (this.recordQueue.isEmpty() && !this.query.isEmpty()) { try { @@ -198,6 +185,23 @@ public class RestIterator implements Iterator { throw new RuntimeException(e); } } + + if (!this.recordQueue.isEmpty()) { + return true; + } + + disconnect(); + return false; + } + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#next() + */ + @Override + public String next() { + synchronized (this.recordQueue) { return this.recordQueue.poll(); } } From d942a1101bfb60b536f56e39ab9a7a00925ac6f3 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 14 Jun 2024 12:14:38 +0300 Subject: [PATCH 5/7] Miscellaneous updates to the copying operation to Impala Cluster: - Show some counts and the elapsed time for various sub-tasks. - Code polishing. 
--- .../oozie_app/copyDataToImpalaCluster.sh | 35 +++++++++++-------- .../oozie_app/copyDataToImpalaCluster.sh | 35 +++++++++++-------- .../oozie_app/copyDataToImpalaCluster.sh | 35 +++++++++++-------- .../oozie_app/copyDataToImpalaCluster.sh | 35 +++++++++++-------- 4 files changed, 80 insertions(+), 60 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index ca0f7a643..55deada40 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -55,20 +55,20 @@ function print_elapsed_time() hours=$((elapsed_time / 3600)) minutes=$(((elapsed_time % 3600) / 60)) seconds=$((elapsed_time % 60)) - printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds + printf "%02d:%02d:%02d" $hours $minutes $seconds } function copydb() { db=$1 - start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" + start_db_time=$(date +%s) # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 @@ -78,6 +78,7 @@ function copydb() { fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" + start_file_transfer_time=$(date +%s) # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max memory of: 70 * 6144 = 430 Gb # Using 1MB as a buffer-size. @@ -93,7 +94,7 @@ function copydb() { ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. - echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log @@ -118,6 +119,7 @@ function copydb() { fi echo -e "\nCreating schema for db: '${db}'\n" + start_create_schema_time=$(date +%s) # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -128,7 +130,8 @@ function copydb() { all_create_view_statements=() num_tables=0 - entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs. 
+ echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n" for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. @@ -152,8 +155,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log exit 6 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi fi fi @@ -208,8 +212,11 @@ function copydb() { previous_num_of_views_to_retry=$new_num_of_views_to_retry done + entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) + echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" + + start_compute_stats_time=$(date +%s) echo -e "\nComputing stats for tables..\n" - entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. @@ -221,20 +228,18 @@ function copydb() { fi done + echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n" + rm -f error.log # Cleanup the temp log-file. + # Check if the entities in both clusters are the same, down to the exact names, not just the counts. 
(they are sorted in the same way both in hive and impala) - if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then - echo -e "\nAll entities have been copied to Impala cluster.\n" + if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then + echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" else - echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" - rm -f error.log + echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 8 fi fi - - rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n" - print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index ca0f7a643..55deada40 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -55,20 +55,20 @@ function print_elapsed_time() hours=$((elapsed_time / 3600)) minutes=$(((elapsed_time % 3600) / 60)) seconds=$((elapsed_time % 60)) - printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds + printf "%02d:%02d:%02d" $hours $minutes $seconds } function copydb() { db=$1 - start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" + start_db_time=$(date +%s) # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 @@ -78,6 +78,7 @@ function copydb() { fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" + start_file_transfer_time=$(date +%s) # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max memory of: 70 * 6144 = 430 Gb # Using 1MB as a buffer-size. @@ -93,7 +94,7 @@ function copydb() { ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. - echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. 
GOT EXIT STATUS: $?\n\n" rm -f error.log @@ -118,6 +119,7 @@ function copydb() { fi echo -e "\nCreating schema for db: '${db}'\n" + start_create_schema_time=$(date +%s) # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -128,7 +130,8 @@ function copydb() { all_create_view_statements=() num_tables=0 - entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs. + echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n" for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. @@ -152,8 +155,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log exit 6 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi fi fi @@ -208,8 +212,11 @@ function copydb() { previous_num_of_views_to_retry=$new_num_of_views_to_retry done + entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) + echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" + + start_compute_stats_time=$(date +%s) echo -e "\nComputing stats for tables..\n" - entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. @@ -221,20 +228,18 @@ function copydb() { fi done + echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n" + rm -f error.log # Cleanup the temp log-file. + # Check if the entities in both clusters are the same, down to the exact names, not just the counts. 
(they are sorted in the same way both in hive and impala) - if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then - echo -e "\nAll entities have been copied to Impala cluster.\n" + if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then + echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" else - echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" - rm -f error.log + echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 8 fi fi - - rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n" - print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index dd2203eef..43498abd2 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -55,20 +55,20 @@ function print_elapsed_time() hours=$((elapsed_time / 3600)) minutes=$(((elapsed_time % 3600) / 60)) seconds=$((elapsed_time % 60)) - printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds + printf "%02d:%02d:%02d" $hours $minutes $seconds } function copydb() { db=$1 - start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" + start_db_time=$(date +%s) # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 @@ -78,6 +78,7 @@ function copydb() { fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" + start_file_transfer_time=$(date +%s) # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max memory of: 70 * 6144 = 430 Gb # Using 1MB as a buffer-size. @@ -93,7 +94,7 @@ function copydb() { ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. - echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. 
GOT EXIT STATUS: $?\n\n" rm -f error.log @@ -118,6 +119,7 @@ function copydb() { fi echo -e "\nCreating schema for db: '${db}'\n" + start_create_schema_time=$(date +%s) # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -128,7 +130,8 @@ function copydb() { all_create_view_statements=() num_tables=0 - entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs. + echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n" for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. @@ -152,8 +155,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log exit 6 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi fi fi @@ -208,8 +212,11 @@ function copydb() { previous_num_of_views_to_retry=$new_num_of_views_to_retry done + entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) + echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" + + start_compute_stats_time=$(date +%s) echo -e "\nComputing stats for tables..\n" - entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. @@ -221,20 +228,18 @@ function copydb() { fi done + echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n" + rm -f error.log # Cleanup the temp log-file. + # Check if the entities in both clusters are the same, down to the exact names, not just the counts. 
(they are sorted in the same way both in hive and impala) - if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then - echo -e "\nAll entities have been copied to Impala cluster.\n" + if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then + echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" else - echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" - rm -f error.log + echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 8 fi fi - - rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n" - print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 918775f49..1d5842d06 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -57,20 +57,20 @@ function print_elapsed_time() hours=$((elapsed_time / 3600)) minutes=$(((elapsed_time % 3600) / 60)) seconds=$((elapsed_time % 60)) - printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds + printf "%02d:%02d:%02d" $hours $minutes $seconds } function copydb() { db=$1 - start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" + start_db_time=$(date +%s) # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" rm -f error.log if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 2 @@ -80,6 +80,7 @@ function copydb() { fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" + start_file_transfer_time=$(date +%s) # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max memory of: 70 * 6144 = 430 Gb # Using 1MB as a buffer-size. @@ -95,7 +96,7 @@ function copydb() { ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command. - echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n" + echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster, after: $(print_elapsed_time start_file_transfer_time)\n" else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. 
GOT EXIT STATUS: $?\n\n" rm -f error.log @@ -120,6 +121,7 @@ function copydb() { fi echo -e "\nCreating schema for db: '${db}'\n" + start_create_schema_time=$(date +%s) # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -130,7 +132,8 @@ function copydb() { all_create_view_statements=() num_tables=0 - entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=(`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`) # Get the tables and views without any potential the "WARN" logs. + echo -e "\nGoing to create ${#entities_on_ocean[@]} entities for db '${db}'..\n" for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. @@ -154,8 +157,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log exit 6 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi fi fi @@ -210,8 +214,11 @@ function copydb() { previous_num_of_views_to_retry=$new_num_of_views_to_retry done + entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) + echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" + + start_compute_stats_time=$(date +%s) echo -e "\nComputing stats for tables..\n" - entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. @@ -223,20 +230,18 @@ function copydb() { fi done + echo -e "\nFinished computing stats for tables, after: $(print_elapsed_time start_compute_stats_time)\n" + rm -f error.log # Cleanup the temp log-file. + # Check if the entities in both clusters are the same, down to the exact names, not just the counts. 
(they are sorted in the same way both in hive and impala) - if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then - echo -e "\nAll entities have been copied to Impala cluster.\n" + if [[ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]]; then + echo -e "\nAll entities have been copied to Impala cluster.\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" else - echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" - rm -f error.log + echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 8 fi fi - - rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n" - print_elapsed_time start_db_time } STATS_DB=$1 From 38636942c7af3f400f01618a6667f07fa29268d0 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 14 Jun 2024 15:11:19 +0300 Subject: [PATCH 6/7] filtering out deletedbyinference and invinsible results from accessroute --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 7bad34e86..dafec9a6f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -65,4 +65,5 @@ DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result -lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; + lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute +WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; From 3095047e5e6cf1cba744264296f5e01f96bfb4b4 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 18 Jun 2024 14:40:41 +0300 Subject: [PATCH 7/7] Miscellaneous updates to the copying operation to Impala Cluster: - Fix not breaking out of the VIEWS-infinite-loop when the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" is set to "false". - Exit the script when no HDFS-active-node was found, independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR". - Fix view_name-recognition in a log-message, by using the more advanced "Perl-Compatible Regular Expressions" in "grep". - Add error-handling for "compute stats" errors. 
--- .../oozie_app/copyDataToImpalaCluster.sh | 24 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 24 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 24 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 24 ++++++++++++------- 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 55deada40..978cf4a9a 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -32,9 +32,7 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 1 - fi + exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred. fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -148,7 +146,7 @@ function copydb() { echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 5 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` @@ -188,7 +186,7 @@ function copydb() { specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" - echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" + echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. else all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. @@ -200,9 +198,11 @@ function copydb() { # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
BREAKING-OUT..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 7 + else + break # Break form the inf-loop of views and continue by computing stats for the tables. fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -224,7 +224,15 @@ function copydb() { # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log + exit 8 + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. + fi fi done @@ -237,7 +245,7 @@ function copydb() { else echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 8 + exit 9 fi fi } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 55deada40..978cf4a9a 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -32,9 +32,7 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 1 - fi + exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred. fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -148,7 +146,7 @@ function copydb() { echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 5 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. 
else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` @@ -188,7 +186,7 @@ function copydb() { specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" - echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" + echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. else all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. @@ -200,9 +198,11 @@ function copydb() { # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! BREAKING-OUT..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 7 + else + break # Break form the inf-loop of views and continue by computing stats for the tables. fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -224,7 +224,15 @@ function copydb() { # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log + exit 8 + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. 
+ fi fi done @@ -237,7 +245,7 @@ function copydb() { else echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 8 + exit 9 fi fi } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 43498abd2..55ae3114e 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -32,9 +32,7 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 1 - fi + exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred. fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -148,7 +146,7 @@ function copydb() { echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 5 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` @@ -188,7 +186,7 @@ function copydb() { specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" - echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" + echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. else all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. @@ -200,9 +198,11 @@ function copydb() { # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
BREAKING-OUT..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 7 + else + break # Break form the inf-loop of views and continue by computing stats for the tables. fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -224,7 +224,15 @@ function copydb() { # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log + exit 8 + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. + fi fi done @@ -237,7 +245,7 @@ function copydb() { else echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 8 + exit 9 fi fi } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 1d5842d06..07a8a4534 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -30,9 +30,7 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 1 - fi + exit 1 # This is fatal and we have to exit independently of the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" config, as none of the DBs will be able to get transferred. fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -150,7 +148,7 @@ function copydb() { echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 5 - fi + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. 
else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` @@ -190,7 +188,7 @@ function copydb() { specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" - echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" + echo -e "\nView '$(cat error.log | grep -Po "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n" ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason. else all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list. @@ -202,9 +200,11 @@ function copydb() { # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters. if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! BREAKING-OUT..\n\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then exit 7 + else + break # Break form the inf-loop of views and continue by computing stats for the tables. fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" @@ -226,7 +226,15 @@ function copydb() { # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + rm -f error.log + exit 8 + fi # This error is not FATAL, do we do not return from this function, in normal circumstances. + fi fi done @@ -239,7 +247,7 @@ function copydb() { else echo -e "\n\nERROR: $((${#entities_on_ocean[@]} - ${#entities_on_impala[@]})) ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n\nFinished processing db: '${db}', after: $(print_elapsed_time start_db_time)\n" if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then - exit 8 + exit 9 fi fi }
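
For illustration only, a minimal standalone sketch of the null-handling pattern that PATCH 1/7 applies in MergeEntitiesComparator: the collectedfrom list is wrapped in an Optional so the whole stream pipeline collapses to -1 when the list is null or empty, instead of throwing an NPE. The class and authority values below are placeholders, not the project's actual types.

```java
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

// Sketch of the null-safe "max authority index" extraction from PATCH 1/7.
public class NullSafeMaxExample {

    // Placeholder authority keys; the real comparator uses ModelConstants ids.
    private static final List<String> PID_AUTHORITIES = Arrays.asList("authority-a", "authority-b");

    static int maxAuthorityIndex(List<String> collectedFrom) {
        return Optional
            .ofNullable(collectedFrom)                  // guard against a null list
            .map(cf -> cf
                .stream()
                .map(PID_AUTHORITIES::indexOf)          // -1 for unknown authorities
                .max(Integer::compare)
                .orElse(-1))                            // empty list -> -1
            .orElse(-1);                                // null list -> -1
    }

    public static void main(String[] args) {
        System.out.println(maxAuthorityIndex(null));                          // -1, no NPE
        System.out.println(maxAuthorityIndex(Arrays.asList("authority-b")));  // 1
    }
}
```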
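
Similarly, a minimal sketch of the iterator fix from PATCH 4/7: the page-fetching loop moves from next() into hasNext(), so next() only drains a queue that hasNext() has already filled and can no longer hand back null at the end of the stream. The fetchNextPage() stub is a hypothetical stand-in for the plugin's downloadPage() call, not the real implementation.

```java
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;

// Sketch of the hasNext()/next() split applied to RestIterator in PATCH 4/7.
class PagedRecordIterator implements Iterator<String> {

    private final Queue<String> recordQueue = new LinkedList<>();
    private String query = "page=1"; // empty string means: no more pages to fetch

    @Override
    public boolean hasNext() {
        synchronized (recordQueue) {
            // Keep fetching pages until records are available or pagination ends.
            while (recordQueue.isEmpty() && !query.isEmpty()) {
                query = fetchNextPage(query);
            }
            return !recordQueue.isEmpty();
        }
    }

    @Override
    public String next() {
        synchronized (recordQueue) {
            // As in the patch, next() only polls; callers check hasNext() first.
            return recordQueue.poll();
        }
    }

    // Hypothetical stand-in: fills recordQueue from the current page and
    // returns the query for the next page, or "" when pagination is finished.
    private String fetchNextPage(String currentQuery) {
        return "";
    }
}
```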