diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
index 059fb9089..26760d650 100644
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,8 @@ fi
 export HADOOP_USER_NAME=$2
 
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
 # Set the active HDFS node of OCEAN and IMPALA cluster.
 OCEAN_HDFS_NODE='hdfs://nameservice1'
@@ -30,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
 done
 if [ -z "$IMPALA_HDFS_NODE" ]; then
     echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
-    exit 1
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+        exit 1
+    fi
 fi
 echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
 IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
 # Set sed arguments.
 LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+function print_elapsed_time()
+{
+    start_time=$1
+    end_time=$(date +%s)
+    elapsed_time=$(($end_time-$start_time))
+    hours=$((elapsed_time / 3600))
+    minutes=$(((elapsed_time % 3600) / 60))
+    seconds=$((elapsed_time % 60))
+    printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
+}
 function copydb() {
     db=$1
+    start_db_time=$(date +%s)
     echo -e "\nStart processing db: '${db}'..\n"
     # Delete the old DB from Impala cluster (if exists).
@@ -67,7 +70,9 @@ function copydb() {
     if [ -n "$log_errors" ]; then
         echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
         rm -f error.log
-        exit 2
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 2
+        fi
     fi
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -91,7 +96,9 @@ function copydb() {
     else
         echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
         rm -f error.log
-        exit 3
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 3
+        fi
     fi
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
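Note on the new kill-switch: the guards test the bare name SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR rather than $SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR. This is valid bash: inside [[ ... ]] the -eq operator evaluates both operands as arithmetic, and arithmetic expansion dereferences variable names. A minimal sketch of the same pattern factored into a helper (the helper name is hypothetical, not part of this patch):

    SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0

    function exit_on_error() {
        local code=$1
        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
            exit $code
        fi
    }

    # exit_on_error 2   # would stand in for each inline "if [[ ... ]]; then exit 2; fi" block

With the toggle left at 0, every former hard exit now logs the error and falls through, so a failure in one database no longer aborts the copy of the remaining ones.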
@@ -109,17 +116,13 @@ function copydb() {
     num_tables=0
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
-    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
         # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
-        create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
-        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+        create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
         if [ -n "$create_view_statement_test" ]; then
             echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
-            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
-                | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
-                | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
-                | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
             all_create_view_statements+=("$create_view_statement")
         else
             echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -127,12 +130,17 @@ function copydb() {
             CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
             if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
                 echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
-                exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+                if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                    exit 4
+                fi
             else
                 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                 log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
                 if [ -n "$log_errors" ]; then
                     echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+                    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                        exit 5
+                    fi
                 fi
             fi
         fi
@@ -176,7 +184,9 @@ function copydb() {
         if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
             echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            exit 5
+            if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                exit 6
+            fi
         elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
             echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
         else
@@ -204,11 +214,14 @@ function copydb() {
     else
         echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
         rm -f error.log
-        exit 6
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 7
+        fi
     fi
     rm -f error.log
-    echo -e "\n\nFinished processing db: ${db}\n\n"
+    echo -e "\n\nFinished processing db: ${db}\n"
+    print_elapsed_time start_db_time
 }
diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
index 1130a684d..26760d650 100644
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,9 @@ fi
 export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
+
 # Set the active HDFS node of OCEAN and IMPALA cluster.
 OCEAN_HDFS_NODE='hdfs://nameservice1'
 echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
 done
 if [ -z "$IMPALA_HDFS_NODE" ]; then
     echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
-    exit 1
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+        exit 1
+    fi
 fi
 echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
 IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
 # Set sed arguments.
 LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+function print_elapsed_time()
+{
+    start_time=$1
+    end_time=$(date +%s)
+    elapsed_time=$(($end_time-$start_time))
+    hours=$((elapsed_time / 3600))
+    minutes=$(((elapsed_time % 3600) / 60))
+    seconds=$((elapsed_time % 60))
+    printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
+}
 function copydb() {
     db=$1
+    start_db_time=$(date +%s)
     echo -e "\nStart processing db: '${db}'..\n"
     # Delete the old DB from Impala cluster (if exists).
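The new print_elapsed_time is called as print_elapsed_time start_db_time, passing the variable's name rather than its value. That still works: start_time receives the literal string start_db_time, and $(( ... )) resolves identifiers to their numeric values, so the subtraction sees the stored epoch seconds (passing $start_db_time would behave identically). A quick usage sketch with the arithmetic worked through:

    start_db_time=$(date +%s)
    sleep 2                            # stand-in for the real copy work
    print_elapsed_time start_db_time   # as the patch calls it

    # For an elapsed_time of 3725 seconds:
    #   hours   = 3725 / 3600        = 1
    #   minutes = (3725 % 3600) / 60 = 2
    #   seconds = 3725 % 60          = 5
    # -> prints "Elapsed time: 01:02:05"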
@@ -66,7 +70,9 @@ function copydb() {
     if [ -n "$log_errors" ]; then
         echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
         rm -f error.log
-        exit 2
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 2
+        fi
     fi
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -90,7 +96,9 @@ function copydb() {
     else
         echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
         rm -f error.log
-        exit 3
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 3
+        fi
     fi
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -108,17 +116,13 @@ function copydb() {
     num_tables=0
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
-    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
         # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
-        create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
-        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+        create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
         if [ -n "$create_view_statement_test" ]; then
             echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
-            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
-                | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
-                | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
-                | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
             all_create_view_statements+=("$create_view_statement")
         else
             echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -126,12 +130,17 @@ function copydb() {
             CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
             if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
                 echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
-                exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+                if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                    exit 4
+                fi
             else
                 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                 log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
                 if [ -n "$log_errors" ]; then
                     echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+                    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                        exit 5
+                    fi
                 fi
             fi
         fi
@@ -175,7 +184,9 @@ function copydb() {
         if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
             echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            exit 5
+            if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                exit 6
+            fi
         elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
             echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
         else
@@ -203,11 +214,14 @@ function copydb() {
     else
         echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
         rm -f error.log
-        exit 6
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 7
+        fi
     fi
     rm -f error.log
-    echo -e "\n\nFinished processing db: ${db}\n\n"
+    echo -e "\n\nFinished processing db: ${db}\n"
+    print_elapsed_time start_db_time
 }
diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
index de275145b..1ab3e417a 100644
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,9 @@ fi
 export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
+
 # Set the active HDFS node of OCEAN and IMPALA cluster.
 OCEAN_HDFS_NODE='hdfs://nameservice1'
 echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
 done
 if [ -z "$IMPALA_HDFS_NODE" ]; then
     echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
-    exit 1
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+        exit 1
+    fi
 fi
 echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
 IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
 # Set sed arguments.
 LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+function print_elapsed_time()
+{
+    start_time=$1
+    end_time=$(date +%s)
+    elapsed_time=$(($end_time-$start_time))
+    hours=$((elapsed_time / 3600))
+    minutes=$(((elapsed_time % 3600) / 60))
+    seconds=$((elapsed_time % 60))
+    printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
+}
 function copydb() {
     db=$1
+    start_db_time=$(date +%s)
     echo -e "\nStart processing db: '${db}'..\n"
     # Delete the old DB from Impala cluster (if exists).
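The DATE/HASH/LOCATION sed arguments deleted above existed only to re-quote reserved column names after the old pipeline had stripped every backtick. Running hive with --database ${db} returns the statement with Hive's own backtick-quoting intact, so none of that patching is needed; the only thing missing is the database qualifier, which the new trailing sed restores. A sketch of that step on a made-up statement (names illustrative, not from the patch):

    db="stats_db"
    stmt='CREATE VIEW `result_view` AS SELECT `date`, `hash` FROM `stats_db`.`result`'
    echo "$stmt" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"
    # -> CREATE VIEW stats_db.`result_view` AS SELECT `date`, `hash` FROM `stats_db`.`result`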
@@ -66,7 +70,9 @@ function copydb() {
     if [ -n "$log_errors" ]; then
         echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
         rm -f error.log
-        exit 2
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 2
+        fi
     fi
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -90,7 +96,9 @@ function copydb() {
     else
         echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
         rm -f error.log
-        exit 3
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 3
+        fi
     fi
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
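Every impala-shell call in these scripts follows the same convention visible in the hunks above: tee the output to error.log, grep it for suspicious log levels, and treat any match as a failure, rather than relying on the exit status alone. A minimal sketch of that convention as a reusable helper (the function name is hypothetical, not part of this patch):

    function run_impala_checked() {
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "$1" |& tee error.log
        log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
        [ -z "$log_errors" ]   # status 0 only when nothing suspicious was logged
    }

    # run_impala_checked "drop database if exists ${db} cascade" || echo "drop failed"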
if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" @@ -126,12 +130,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -175,7 +184,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
@@ -175,7 +184,9 @@ function copydb() {
         if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
             echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            exit 5
+            if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                exit 6
+            fi
         elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
             echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
         else
@@ -203,11 +214,14 @@ function copydb() {
     else
         echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
         rm -f error.log
-        exit 6
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 7
+        fi
     fi
     rm -f error.log
-    echo -e "\n\nFinished processing db: ${db}\n\n"
+    echo -e "\n\nFinished processing db: ${db}\n"
+    print_elapsed_time start_db_time
 }
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index 6fc0aa745..7957a659c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -6,6 +6,8 @@ then
     ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
 fi
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
 # Set the active HDFS node of OCEAN and IMPALA cluster.
 OCEAN_HDFS_NODE='hdfs://nameservice1'
@@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do
 done
 if [ -z "$IMPALA_HDFS_NODE" ]; then
     echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
-    exit 1
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+        exit 1
+    fi
 fi
 echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
 # Set sed arguments.
 LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
-
 export HADOOP_USER_NAME=$6
 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
+function print_elapsed_time()
+{
+    start_time=$1
+    end_time=$(date +%s)
+    elapsed_time=$(($end_time-$start_time))
+    hours=$((elapsed_time / 3600))
+    minutes=$(((elapsed_time % 3600) / 60))
+    seconds=$((elapsed_time % 60))
+    printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
+}
+
+
 function copydb() {
     db=$1
+    start_db_time=$(date +%s)
     echo -e "\nStart processing db: '${db}'..\n"
     # Delete the old DB from Impala cluster (if exists).
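The 'hadoop distcp' failure branch below guards the bulk copy of ${db}'s warehouse directory from the Ocean cluster to the Impala cluster; its exit status is what the 'GOT EXIT STATUS: $?' message reports. The actual invocation sits in unchanged lines, so the following is only a hedged sketch of its shape, and the option values are assumptions:

    hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb \
        ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db \
        ${IMPALA_HDFS_DB_BASE_PATH}

-pb preserves the source block size, which is generally recommended when copying parquet files between clusters.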
@@ -68,7 +72,9 @@ function copydb() {
     if [ -n "$log_errors" ]; then
         echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
         rm -f error.log
-        exit 2
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 2
+        fi
     fi
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -92,7 +98,9 @@ function copydb() {
     else
         echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
         rm -f error.log
-        exit 3
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 3
+        fi
     fi
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -110,17 +118,13 @@ function copydb() {
     num_tables=0
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
-    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
         # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
-        create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
-        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+        create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
         if [ -n "$create_view_statement_test" ]; then
             echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
-            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
-                | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
-                | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
-                | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
             all_create_view_statements+=("$create_view_statement")
         else
             echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
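The hunk below keeps Impala's schema-by-example trick: list the table's directory on the Impala cluster, pick the first real data file, and let CREATE TABLE ... LIKE PARQUET clone the schema from it. Any data file works, because each parquet file carries the full column schema in its footer. An illustrative walk-through (paths hypothetical):

    # hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/result/
    #   .../result/_impala_insert_staging       <- dropped by grep -v
    #   .../result/part-00000.snappy.parquet    <- kept; head -1 selects it
    # Impala then runs:
    #   create table ${db}.result like parquet '.../part-00000.snappy.parquet' stored as parquet;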
@@ -128,12 +132,17 @@ function copydb() {
             CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
             if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
                 echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
-                exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+                if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                    exit 4
+                fi
             else
                 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                 log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
                 if [ -n "$log_errors" ]; then
                     echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+                    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                        exit 5
+                    fi
                 fi
             fi
         fi
@@ -177,7 +186,9 @@ function copydb() {
         if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
             echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            exit 5
+            if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+                exit 6
+            fi
         elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
             echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
         else
@@ -205,11 +216,14 @@ function copydb() {
     else
         echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
         rm -f error.log
-        exit 6
+        if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+            exit 7
+        fi
     fi
     rm -f error.log
-    echo -e "\n\nFinished processing db: ${db}\n\n"
+    echo -e "\n\nFinished processing db: ${db}\n"
+    print_elapsed_time start_db_time
 }
 STATS_DB=$1
@@ -233,6 +247,6 @@ copydb $MONITOR_DB'_ris_tail'
 contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
 for i in ${contexts}
 do
-    tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
+    tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
     copydb ${MONITOR_DB}'_'${tmp}
 done
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
index eb16a161e..c0993ef0b 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
@@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
 with
     lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
     lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
-    lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
-select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
+    lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
+    lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
+select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
 from lvl1
     join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
-    join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
+    join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
+    join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
+
 DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
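The step7.sql change above adds a fourth Fields-of-Science level to result_fos. Each FoS level code extends its parent's code by two digits, which is why the new lvl4 CTE matches topics like '________ %' (eight characters before the space) and joins on the first six characters, i.e. the lvl3 prefix. A worked example with hypothetical codes:

    -- lvl1.topic = '01 natural sciences'
    -- lvl2.topic = '0101 mathematics'          substr(lvl2.topic, 1, 2) = '01'     = substr(lvl1.topic, 1, 2)
    -- lvl3.topic = '010101 pure mathematics'   substr(lvl3.topic, 1, 4) = '0101'   = substr(lvl2.topic, 1, 4)
    -- lvl4.topic = '01010101 algebra'          substr(lvl4.topic, 1, 6) = '010101' = substr(lvl3.topic, 1, 6)
    -- Each output row therefore ties a result id to one consistent lvl1 > lvl2 > lvl3 > lvl4 branch.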