From 1f5aba12faefdfa5d56d38b58deea15b46b60ea9 Mon Sep 17 00:00:00 2001
From: antleb
Date: Wed, 17 Apr 2024 23:54:23 +0300
Subject: [PATCH 1/9] slight optimization in indi_pub_gold_oa definition

---
 .../stats/oozie_app/scripts/step16-createIndicatorsTables.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 70cde6481..0845387d3 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -242,7 +242,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
     select id, issn_online as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_online) foo
     )
 SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
-FROM ${stats_db_name}.publication_datasources pd
+FROM ${stats_db_name}.publication pd
     left outer join (
     select pd.id, 1 as is_gold
     FROM ${stats_db_name}.publication_datasources pd
--
2.17.1

From 27d22bd8f945db559392fc1eabcfe185d4183aac Mon Sep 17 00:00:00 2001
From: antleb
Date: Wed, 17 Apr 2024 23:59:52 +0300
Subject: [PATCH 2/9] slight optimization in indi_pub_gold_oa definition

---
 .../stats/oozie_app/scripts/step16-createIndicatorsTables.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 0845387d3..455c173ef 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -246,7 +246,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
     left outer join (
     select pd.id, 1 as is_gold
     FROM ${stats_db_name}.publication_datasources pd
-    join dd on dd.id=pd.datasource
+    left semi join dd on dd.id=pd.datasource
     left outer join ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
 
 drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
--
2.17.1
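Patch 2 above replaces the inner join against the dd helper relation (defined earlier in the same statement) with a left semi join. A semi join is a pure existence check: each left-side row is emitted at most once even if several right-side rows match, and the right side's columns are never projected, which lets the planner pick a cheaper strategy. A minimal sketch of the difference, reusing the patch's own table names:

    -- inner join: may emit one row per matching dd row, duplicates possible
    select pd.id, 1 as is_gold
    from ${stats_db_name}.publication_datasources pd
    join dd on dd.id=pd.datasource;

    -- left semi join: existence test only, each pd row appears at most once
    select pd.id, 1 as is_gold
    from ${stats_db_name}.publication_datasources pd
    left semi join dd on dd.id=pd.datasource;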
From 308ae580a97afba2cf19bf79d8022dbac33fc1e1 Mon Sep 17 00:00:00 2001
From: antleb
Date: Thu, 18 Apr 2024 10:57:52 +0300
Subject: [PATCH 3/9] slight optimization in indi_pub_gold_oa definition

---
 .../scripts/step16-createIndicatorsTables.sql | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 455c173ef..18d66c6db 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -282,14 +282,17 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
 drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
 
 create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
-select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication pd
+select distinct p.id, coalesce(is_hybrid, 0) is_hybrid
+from ${stats_db_name}.publication p
 left outer join (
-    select pd.id, 1 as is_hybrid from ${stats_db_name}.publication pd
-    join ${stats_db_name}.result_instance ri on ri.id=pd.id
-    join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id
-    join ${stats_db_name}.result_accessroute ra on ra.id=pd.id
+    select p.id, 1 as is_hybrid
+    from ${stats_db_name}.publication p
+    join ${stats_db_name}.result_instance ri on ri.id=p.id
     join ${stats_db_name}.datasource d on d.id=ri.hostedby
-    where indi_gold.is_gold=0 and ((d.type like '%Journal%' and ri.accessright!='Closed Access' and ri.accessright!='Restricted' and ri.license is not null) or ra.accessroute='hybrid')) tmp on pd.id=tmp.id; /*EOS*/
+    join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id
+    left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id
+    where indi_gold.is_gold=0 and
+          ((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on p.id=tmp.id; /*EOS*/
 
 drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as
--
2.17.1
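Besides the alias cleanup, the substantive changes in patch 3 are that result_accessroute is now joined with a left outer join — with the previous inner join, a publication without any accessroute row could never be flagged hybrid, even when the journal/access-right/license condition held — and that 'Not Available' joins the excluded access rights. The statement keeps the indicator shape used throughout this script: flag the qualifying ids with a literal 1 in a subquery, left-outer-join it back to the base table, and coalesce the misses to 0. A stripped-down sketch of that shape, with a hypothetical evidence table standing in for the real joins:

    select distinct p.id, coalesce(is_x, 0) as is_x
    from ${stats_db_name}.publication p
    left outer join (
        select e.id, 1 as is_x
        from evidence e) tmp on tmp.id=p.id;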
From e728a0897c88c4496f4533f713714387b9a1c25f Mon Sep 17 00:00:00 2001
From: antleb
Date: Thu, 18 Apr 2024 11:07:55 +0300
Subject: [PATCH 4/9] fixed the definition of indi_pub_bronze_oa

---
 .../scripts/step16-createIndicatorsTables.sql | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 18d66c6db..ac14e2904 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -664,17 +664,18 @@ drop view pub_fos_totals; /*EOS*/
 
 drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/
 create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
-select distinct pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication pd
-left outer join (select pd.id, 1 as is_bronze_oa from ${stats_db_name}.publication pd
-join ${stats_db_name}.result_instance ri on ri.id=pd.id
-join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id
-join ${stats_db_name}.indi_pub_hybrid indi_hybrid on indi_hybrid.id=pd.id
-join ${stats_db_name}.result_accessroute ra on ra.id=pd.id
-join ${stats_db_name}.datasource d on d.id=ri.hostedby
-where indi_gold.is_gold=0 and indi_hybrid.is_hybrid=0
-and ((d.type like '%Journal%' and ri.accessright!='Closed Access'
-and ri.accessright!='Restricted' and ri.license is null) or ra.accessroute='bronze')) tmp
-on pd.id=tmp.id; /*EOS*/
+select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
+from ${stats_db_name}.publication p
+left outer join (
+    select p.id, 1 as is_bronze_oa
+    from ${stats_db_name}.publication p
+    join ${stats_db_name}.result_instance ri on ri.id=p.id
+    join ${stats_db_name}.datasource d on d.id=ri.hostedby
+    join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id
+    join ${stats_db_name}.indi_pub_hybrid indi_hybrid on indi_hybrid.id=p.id
+    left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id
+    where indi_gold.is_gold=0 and indi_hybrid.is_hybrid=0
+    and ((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is null) or ra.accessroute='bronze')) tmp on p.id=tmp.id; /*EOS*/
 
 CREATE TEMPORARY VIEW project_year_result_year as
 select p.id project_id, acronym, r.id result_id, r.year, p.end_year
--
2.17.1

From 43d05dbebb4d0a8760dee242a1da0146b3698689 Mon Sep 17 00:00:00 2001
From: antleb
Date: Thu, 18 Apr 2024 11:53:50 +0300
Subject: [PATCH 5/9] fixed the definition of result_country

---
 .../scripts/step16-createIndicatorsTables.sql | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index ac14e2904..9ea84023a 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -1004,13 +1004,18 @@ left outer join (
 drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
 
 create table ${stats_db_name}.result_country stored as parquet as
-select distinct ro.id, coalesce(o.country, f.country)
-from ${stats_db_name}.result_organization ro
-left outer join ${stats_db_name}.organization o on o.id=ro.organization
-left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id
-left outer join ${stats_db_name}.project p on p.id=rp.project
-left outer join ${stats_db_name}.funder f on f.name=p.funder
-where coalesce(o.country, f.country) IS NOT NULL;
+select distinct *
+from (
+    select ro.id, o.country
+    from ${stats_db_name}.result_organization ro
+    left outer join ${stats_db_name}.organization o on o.id=ro.organization
+    union all
+    select rp.id, f.country
+    from ${stats_db_name}.result_projects rp
+    left outer join ${stats_db_name}.project p on p.id=rp.project
+    left outer join ${stats_db_name}.funder f on f.name=p.funder
+    ) rc
+where rc.country is not null; /*EOS*/
 
 drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
 create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
--
2.17.1
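An aside on the new result_country definition (an observation, not part of the patch): since the outer query keeps only rows whose country is not null, each left outer join inside the branches behaves exactly like an inner join — a left-join row without a match can only contribute a null country, which the final filter discards anyway. The first branch is therefore equivalent to:

    select ro.id, o.country
    from ${stats_db_name}.result_organization ro
    join ${stats_db_name}.organization o on o.id=ro.organization
    where o.country is not null;

The gain over the old definition is that the two provenance paths (affiliation country, funder country) no longer have to be chained into a single wide four-way join.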
From 0c71c58df69a23968b942fcc62d7d63e4cd3d551 Mon Sep 17 00:00:00 2001
From: antleb
Date: Thu, 18 Apr 2024 12:01:27 +0300
Subject: [PATCH 6/9] fixed the definition of gold_oa

---
 .../stats/oozie_app/scripts/step16-createIndicatorsTables.sql | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 9ea84023a..65193a50c 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -247,7 +247,9 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
     select pd.id, 1 as is_gold
     FROM ${stats_db_name}.publication_datasources pd
     left semi join dd on dd.id=pd.datasource
-    left outer join ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
+    union all
+    select ra.id, 1 as is_gold
+    from ${stats_db_name}.result_accessroute ra where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
 
 drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
 create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as
--
2.17.1

From c3fe9662b22e1a80ccb56479639338f79f8d1832 Mon Sep 17 00:00:00 2001
From: Antonis Lempesis
Date: Fri, 19 Apr 2024 12:45:36 +0300
Subject: [PATCH 7/9] all indicator tables are now stored as parquet

---
 .../oozie_app/scripts/step16-createIndicatorsTables.sql | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 65193a50c..1a4002bcf 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -380,7 +380,7 @@ CREATE TEMPORARY VIEW allresults as
 
 drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
 
-create table if not exists ${stats_db_name}.indi_org_fairness_pub as
+create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
 select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
 from allresults ar
 join result_fair rf on rf.organization=ar.organization; /*EOS*/
@@ -639,7 +639,7 @@ from ${stats_db_name}.publication p
 
 drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
 
-create table if not exists ${stats_db_name}.indi_result_with_pid as
+create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
 select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
 from ${stats_db_name}.result p
 left outer join (
@@ -653,7 +653,7 @@ group by rf.id; /*EOS*/
 
 drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
 
-create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity as
+create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
 select distinct p.id as id, coalesce(is_interdisciplinary, 0) as is_interdisciplinary
 from pub_fos_totals p
 left outer join (
--
2.17.1
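Patch 7 adds the explicit parquet clause to the three indicator tables that were still created with Hive's default storage format. Should one need to confirm what a given table ended up with, the storage format is visible in its metadata; an illustrative check (the database name is a placeholder):

    -- the "SerDe Library" / "InputFormat" rows of the output should mention Parquet
    describe formatted stats_db.indi_result_with_pid;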
From d2649a1429ffcb7355d41696bd7abd2744d0a81b Mon Sep 17 00:00:00 2001
From: antleb
Date: Tue, 23 Apr 2024 16:03:16 +0300
Subject: [PATCH 8/9] increased the jvm ram

---
 .../dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
index 022a107ab..b684b5e24 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml
@@ -30,6 +30,10 @@
         <property>
             <name>oozie.launcher.mapred.job.queue.name</name>
            <value>${oozieLauncherQueueName}</value>
         </property>
+        <property>
+            <name>mapred.child.java.opts</name>
+            <value>-Xmx16g</value>
+        </property>
     </configuration>
 </global>
--
2.17.1

From 49af2e574088d24a799d4c22741a8f0e03455826 Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Tue, 23 Apr 2024 17:15:04 +0300
Subject: [PATCH 9/9] Miscellaneous updates to the copying operation to Impala
 Cluster:

- Update the algorithm for creating views that depend on other views; overcome
  some bash instabilities.
- Upon any error, fail the whole process, not just the current DB-creation, as
  those errors usually indicate a bug in the initial DB-creation, which should
  be fixed immediately.
- Enhance the parallel copying of large files by the "hadoop distcp" command.
- Reduce the "invalidate metadata" commands to just the current DB's tables, in
  order to eliminate the general overhead on Impala.
- Show the number of tables and views in the logs.
- Fix some log messages.
---
 .../oozie_app/copyDataToImpalaCluster.sh      | 71 +++++++++----------
 .../oozie_app/copyDataToImpalaCluster.sh      | 71 +++++++++----------
 .../oozie_app/copyDataToImpalaCluster.sh      | 71 +++++++++----------
 .../oozie_app/copyDataToImpalaCluster.sh      | 71 +++++++++----------
 4 files changed, 132 insertions(+), 152 deletions(-)

diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
index 3d9986b64..059fb9089 100644
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
@@ -67,24 +67,21 @@ function copydb() {
     if [ -n "$log_errors" ]; then
         echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
         rm -f error.log
-        return 1
+        exit 2
     fi
 
-    # Make Impala aware of the deletion of the old DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
-    # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
-    # Using max memory of: 50 * 6144 = 300 Gb
+    # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
+    # Using max memory of: 70 * 6144 = 430 Gb
     # Using 1MB as a buffer-size.
-    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+    # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
     # The "ug" args cannot be used as we get a "User does not belong to hive" error.
     # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
     hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
                   -numListstatusThreads 40 \
                   -copybuffersize 1048576 \
                   -strategy dynamic \
+                  -blocksperchunk 8 \
                   -pb \
                   ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
 
@@ -92,9 +89,9 @@ function copydb() {
     if [ $? -eq 0 ]; then
         echo -e "\nSuccessfully copied the files of '${db}'.\n"
     else
-        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
+        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
         rm -f error.log
-        return 2
+        exit 3
     fi
 
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -105,14 +102,11 @@ function copydb() {
 
     # create the new database (with the same name)
     impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
 
-    # Make Impala aware of the creation of the new DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
 
     # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
     # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
     all_create_view_statements=()
+    num_tables=0
 
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
     for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@@ -129,9 +123,11 @@ function copydb() {
             all_create_view_statements+=("$create_view_statement")
         else
             echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+            ((num_tables++))
             CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
             if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
                 echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+                exit 4  # Comment this out when testing a DB which has such a table, just to run this check.
             else
                 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                 log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@@ -142,74 +138,73 @@ function copydb() {
         fi
     done
 
-    echo -e "\nAll tables have been created, going to create the views..\n"
+    previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+    if [[ $num_tables -gt 0 ]]; then
+        echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
+    else
+        echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
+    fi
 
-    # Time to loop through the views and create them.
-    # At this point all table-schemas should have been created.
-
-    previous_num_of_views_to_retry=${#all_create_view_statements}
     if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
-        echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
-        # Make Impala aware of the new tables, so it knows them when creating the views.
-        sleep 1
-        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-        sleep 1
+        echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
     else
         echo -e "\nDB '${db}' does not contain any views.\n"
     fi
 
     level_counter=0
-    while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
        ((level_counter++))
        # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
        # In this case, we should retry creating this particular view again.
-        should_retry_create_view_statements=()
+        new_num_of_views_to_retry=0
 
        for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
            specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
            if [ -n "$specific_errors" ]; then
                echo -e "\nspecific_errors: ${specific_errors}\n"
-                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
-                should_retry_create_view_statements+=("$create_view_statement")
+                echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
+                ((new_num_of_views_to_retry++)) # Increment the counter here, instead of taking the array's size at the end: the pattern-substitution below empties elements instead of removing them, so the size never shrinks.
            else
+                all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Blank-out the current successful statement in the list (the element itself remains, as an empty string).
                sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
            fi
        done
 
-        new_num_of_views_to_retry=${#should_retry_create_view_statements}
+        all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
+        # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array report its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
+
        if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
            echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            return 3
+            exit 5
        elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
-            echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
-            previous_num_of_views_to_retry=$new_num_of_views_to_retry
+            echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
        else
            echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
        fi
-        all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+        previous_num_of_views_to_retry=$new_num_of_views_to_retry
     done
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
-
     echo -e "\nComputing stats for tables..\n"
     entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
 
     for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
        # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
        create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
        if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
+            # Invalidate the metadata of this table, so that Impala becomes aware of all the parquet files previously put inside its directory by "hadoop distcp".
+            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
+            sleep 1
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
        fi
     done
 
+    # Check that the entities in both clusters are the same, down to the exact names, not just the counts (they are sorted the same way in both Hive and Impala).
     if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
        echo -e "\nAll entities have been copied to Impala cluster.\n"
     else
        echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
        rm -f error.log
-        return 4
+        exit 6
     fi
 
     rm -f error.log
diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
index 2711d6e12..1130a684d 100644
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
@@ -66,24 +66,21 @@ function copydb() {
     if [ -n "$log_errors" ]; then
        echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
        rm -f error.log
-        return 1
+        exit 2
     fi
 
-    # Make Impala aware of the deletion of the old DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
-    # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
-    # Using max memory of: 50 * 6144 = 300 Gb
+    # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
+    # Using max memory of: 70 * 6144 = 430 Gb
     # Using 1MB as a buffer-size.
-    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+    # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
     # The "ug" args cannot be used as we get a "User does not belong to hive" error.
     # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
     hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
                   -numListstatusThreads 40 \
                   -copybuffersize 1048576 \
                   -strategy dynamic \
+                  -blocksperchunk 8 \
                   -pb \
                   ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
 
@@ -91,9 +88,9 @@ function copydb() {
     if [ $? -eq 0 ]; then
        echo -e "\nSuccessfully copied the files of '${db}'.\n"
     else
-        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
+        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
        rm -f error.log
-        return 2
+        exit 3
     fi
 
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -104,14 +101,11 @@ function copydb() {
 
     # create the new database (with the same name)
     impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
 
-    # Make Impala aware of the creation of the new DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
 
     # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
     # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
     all_create_view_statements=()
+    num_tables=0
 
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
     for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@@ -128,9 +122,11 @@ function copydb() {
            all_create_view_statements+=("$create_view_statement")
        else
            echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+            ((num_tables++))
            CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
            if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
                echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+                exit 4  # Comment this out when testing a DB which has such a table, just to run this check.
            else
                impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@@ -141,74 +137,73 @@ function copydb() {
        fi
     done
 
-    echo -e "\nAll tables have been created, going to create the views..\n"
+    previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+    if [[ $num_tables -gt 0 ]]; then
+        echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
+    else
+        echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
+    fi
 
-    # Time to loop through the views and create them.
-    # At this point all table-schemas should have been created.
-
-    previous_num_of_views_to_retry=${#all_create_view_statements}
     if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
-        echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
-        # Make Impala aware of the new tables, so it knows them when creating the views.
-        sleep 1
-        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-        sleep 1
+        echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
     else
        echo -e "\nDB '${db}' does not contain any views.\n"
     fi
 
     level_counter=0
-    while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
       ((level_counter++))
       # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
       # In this case, we should retry creating this particular view again.
-        should_retry_create_view_statements=()
+        new_num_of_views_to_retry=0
 
       for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
           impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
           specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
           if [ -n "$specific_errors" ]; then
              echo -e "\nspecific_errors: ${specific_errors}\n"
-                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
-                should_retry_create_view_statements+=("$create_view_statement")
+                echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
+                ((new_num_of_views_to_retry++)) # Increment the counter here, instead of taking the array's size at the end: the pattern-substitution below empties elements instead of removing them, so the size never shrinks.
           else
+                all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Blank-out the current successful statement in the list (the element itself remains, as an empty string).
              sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
          fi
       done
 
-        new_num_of_views_to_retry=${#should_retry_create_view_statements}
+        all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
+        # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array report its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
+
       if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
          echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            return 3
+            exit 5
       elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
-            echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
-            previous_num_of_views_to_retry=$new_num_of_views_to_retry
+            echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
       else
          echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
       fi
-        all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+        previous_num_of_views_to_retry=$new_num_of_views_to_retry
     done
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
-
     echo -e "\nComputing stats for tables..\n"
     entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
 
     for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
       # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
       create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
       if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
+            # Invalidate the metadata of this table, so that Impala becomes aware of all the parquet files previously put inside its directory by "hadoop distcp".
+            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
+            sleep 1
          impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
       fi
     done
 
+    # Check that the entities in both clusters are the same, down to the exact names, not just the counts (they are sorted the same way in both Hive and Impala).
     if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
       echo -e "\nAll entities have been copied to Impala cluster.\n"
     else
       echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
       rm -f error.log
-        return 4
+        exit 6
     fi
 
     rm -f error.log
diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
index 5ad9df762..de275145b 100644
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
@@ -66,24 +66,21 @@ function copydb() {
     if [ -n "$log_errors" ]; then
       echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
       rm -f error.log
-        return 1
+        exit 2
     fi
 
-    # Make Impala aware of the deletion of the old DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
-    # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
-    # Using max memory of: 50 * 6144 = 300 Gb
+    # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
+    # Using max memory of: 70 * 6144 = 430 Gb
     # Using 1MB as a buffer-size.
-    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+    # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
     # The "ug" args cannot be used as we get a "User does not belong to hive" error.
     # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
     hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
                   -numListstatusThreads 40 \
                   -copybuffersize 1048576 \
                   -strategy dynamic \
+                  -blocksperchunk 8 \
                   -pb \
                   ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
 
@@ -91,9 +88,9 @@ function copydb() {
     if [ $? -eq 0 ]; then
       echo -e "\nSuccessfully copied the files of '${db}'.\n"
     else
-        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
+        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
       rm -f error.log
-        return 2
+        exit 3
     fi
 
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -104,14 +101,11 @@ function copydb() {
 
     # create the new database (with the same name)
     impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
 
-    # Make Impala aware of the creation of the new DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
 
     # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
     # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
     all_create_view_statements=()
+    num_tables=0
 
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
     for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@@ -128,9 +122,11 @@ function copydb() {
          all_create_view_statements+=("$create_view_statement")
       else
          echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+            ((num_tables++))
          CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
          if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
             echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+                exit 4  # Comment this out when testing a DB which has such a table, just to run this check.
          else
             impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
             log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@@ -141,74 +137,73 @@ function copydb() {
       fi
     done
 
-    echo -e "\nAll tables have been created, going to create the views..\n"
+    previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+    if [[ $num_tables -gt 0 ]]; then
+        echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
+    else
+        echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
+    fi
 
-    # Time to loop through the views and create them.
-    # At this point all table-schemas should have been created.
-
-    previous_num_of_views_to_retry=${#all_create_view_statements}
     if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
-        echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
-        # Make Impala aware of the new tables, so it knows them when creating the views.
-        sleep 1
-        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-        sleep 1
+        echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
     else
       echo -e "\nDB '${db}' does not contain any views.\n"
     fi
 
     level_counter=0
-    while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
       ((level_counter++))
       # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
       # In this case, we should retry creating this particular view again.
-        should_retry_create_view_statements=()
+        new_num_of_views_to_retry=0
 
       for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
          impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
         specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
         if [ -n "$specific_errors" ]; then
            echo -e "\nspecific_errors: ${specific_errors}\n"
-                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
-                should_retry_create_view_statements+=("$create_view_statement")
+                echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
+                ((new_num_of_views_to_retry++)) # Increment the counter here, instead of taking the array's size at the end: the pattern-substitution below empties elements instead of removing them, so the size never shrinks.
        else
+                all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Blank-out the current successful statement in the list (the element itself remains, as an empty string).
           sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
        fi
      done
 
-        new_num_of_views_to_retry=${#should_retry_create_view_statements}
+        all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
+        # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array report its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
+
      if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
        echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            return 3
+            exit 5
     elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
-            echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
-            previous_num_of_views_to_retry=$new_num_of_views_to_retry
+            echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
     else
       echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
     fi
-        all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+        previous_num_of_views_to_retry=$new_num_of_views_to_retry
     done
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
-
     echo -e "\nComputing stats for tables..\n"
     entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
 
     for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
     # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
     create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
     if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
+            # Invalidate the metadata of this table, so that Impala becomes aware of all the parquet files previously put inside its directory by "hadoop distcp".
+            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
+            sleep 1
       impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
     fi
     done
 
+    # Check that the entities in both clusters are the same, down to the exact names, not just the counts (they are sorted the same way in both Hive and Impala).
     if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
     echo -e "\nAll entities have been copied to Impala cluster.\n"
     else
     echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
     rm -f error.log
-        return 4
+        exit 6
     fi
 
     rm -f error.log
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index c2324b912..6fc0aa745 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -68,24 +68,21 @@ function copydb() {
     if [ -n "$log_errors" ]; then
     echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
     rm -f error.log
-        return 1
+        exit 2
     fi
 
-    # Make Impala aware of the deletion of the old DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-
     echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
-    # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
-    # Using max memory of: 50 * 6144 = 300 Gb
+    # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
+    # Using max memory of: 70 * 6144 = 430 Gb
     # Using 1MB as a buffer-size.
-    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
+    # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
     # The "ug" args cannot be used as we get a "User does not belong to hive" error.
     # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
     hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
                   -numListstatusThreads 40 \
                   -copybuffersize 1048576 \
                   -strategy dynamic \
+                  -blocksperchunk 8 \
                   -pb \
                   ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
 
@@ -93,9 +90,9 @@ function copydb() {
     if [ $? -eq 0 ]; then
     echo -e "\nSuccessfully copied the files of '${db}'.\n"
     else
-        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
+        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
     rm -f error.log
-        return 2
+        exit 3
     fi
 
     # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -106,14 +103,11 @@ function copydb() {
 
     # create the new database (with the same name)
     impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
 
-    # Make Impala aware of the creation of the new DB immediately.
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
 
     # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
     # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
     all_create_view_statements=()
+    num_tables=0
 
     entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`  # Get the tables and views without any potential the "WARN" logs.
     for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@@ -130,9 +124,11 @@ function copydb() {
        all_create_view_statements+=("$create_view_statement")
     else
        echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
+            ((num_tables++))
        CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
        if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
           echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
+                exit 4  # Comment this out when testing a DB which has such a table, just to run this check.
        else
           impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
           log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@@ -143,74 +139,73 @@ function copydb() {
        fi
     done
 
-    echo -e "\nAll tables have been created, going to create the views..\n"
+    previous_num_of_views_to_retry=${#all_create_view_statements[@]}
+    if [[ $num_tables -gt 0 ]]; then
+        echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
+    else
+        echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
+    fi
 
-    # Time to loop through the views and create them.
-    # At this point all table-schemas should have been created.
-
-    previous_num_of_views_to_retry=${#all_create_view_statements}
     if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
-        echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
-        # Make Impala aware of the new tables, so it knows them when creating the views.
-        sleep 1
-        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-        sleep 1
+        echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
     else
     echo -e "\nDB '${db}' does not contain any views.\n"
     fi
 
     level_counter=0
-    while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
+    while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
     ((level_counter++))
     # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
    # In this case, we should retry creating this particular view again.
-        should_retry_create_view_statements=()
+        new_num_of_views_to_retry=0
 
    for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
       impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
      specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
     if [ -n "$specific_errors" ]; then
       echo -e "\nspecific_errors: ${specific_errors}\n"
-                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
-                should_retry_create_view_statements+=("$create_view_statement")
+                echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
+                ((new_num_of_views_to_retry++)) # Increment the counter here, instead of taking the array's size at the end: the pattern-substitution below empties elements instead of removing them, so the size never shrinks.
    else
+                all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Blank-out the current successful statement in the list (the element itself remains, as an empty string).
      sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
    fi
    done
 
-        new_num_of_views_to_retry=${#should_retry_create_view_statements}
+        all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
+        # Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array report its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
+
    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
     echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
-            return 3
+            exit 5
    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
-            echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
-            previous_num_of_views_to_retry=$new_num_of_views_to_retry
+            echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
    else
     echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
    fi
-        all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
+        previous_num_of_views_to_retry=$new_num_of_views_to_retry
    done
-    sleep 1
-    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
-    sleep 1
-
    echo -e "\nComputing stats for tables..\n"
    entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
 
    for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
    # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
    create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
    if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
+            # Invalidate the metadata of this table, so that Impala becomes aware of all the parquet files previously put inside its directory by "hadoop distcp".
+            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
+            sleep 1
      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
    fi
    done
 
+    # Check that the entities in both clusters are the same, down to the exact names, not just the counts (they are sorted the same way in both Hive and Impala).
    if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
    echo -e "\nAll entities have been copied to Impala cluster.\n"
    else
    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
    rm -f error.log
-        return 4
+        exit 6
    fi
 
    rm -f error.log
--
2.17.1
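Two notes on the Impala behavior that patch 9 leans on (the db/table/view names below are placeholders, not the actual databases). First, the metadata handling: a bare INVALIDATE METADATA discards the cached metadata of every table in every database, while the scoped form touches a single table — which is what the per-table loop above relies on to pick up the files written by "hadoop distcp" without a cluster-wide penalty:

    INVALIDATE METADATA;                       -- global: flushes the cache for all tables of all DBs
    INVALIDATE METADATA stats_db.publication;  -- scoped: reloads one table, picking up its new files
    COMPUTE STATS stats_db.publication;        -- then collect table/column stats for the planner

Second, the view-retry loop keys off the fact that Impala rejects a view whose dependency does not exist yet:

    CREATE VIEW stats_db.v2 AS SELECT * FROM stats_db.v1;
    -- fails with "AnalysisException: Could not resolve table reference: 'stats_db.v1'"
    -- until v1 has been created, so re-running the failed statements level by
    -- level converges once every dependency is in place.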