From 3cad4a415d68c495fe723adadf9b43c46d72e80d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 11 Apr 2024 15:44:12 +0200 Subject: [PATCH 1/8] fixed duplicated property dhp-schemas.version --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7387f6e3b..8e6f16fe5 100644 --- a/pom.xml +++ b/pom.xml @@ -888,7 +888,6 @@ 3.3.3 3.4.2 [2.12,3.0) - [5.17.3] [6.1.0] [4.0.3] [6.0.5] From abf0b69f29a4473a582ee137bacdabd51fec5a3c Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 11 Apr 2024 17:12:12 +0300 Subject: [PATCH 2/8] Upgrade the copying operation to Impala Cluster: - Use only hive commands in the Ocean Cluster, as the "impala-shell" will be removed from there to free-up resources. - Hugely improve the performance in every aspect of the copying process: a) speedup file-transferring and DB-deletion, b) eliminate permissions-assignment, "load" operations and "use $db" queries, c) retry only the "create view" statements and only as long as they depend on other non-created views, instead of trying to recreate all tables and views 5 consecutive times. - Add error-checks for the creation of tables and views. 
--- .../oozie_app/copyDataToImpalaCluster.sh | 199 +++++++++++++---- .../oozie_app/copyDataToImpalaCluster.sh | 197 +++++++++++++---- .../oozie_app/copyDataToImpalaCluster.sh | 204 +++++++++++++---- .../oozie_app/copyDataToImpalaCluster.sh | 206 +++++++++++++----- 4 files changed, 623 insertions(+), 183 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 6250aca81..3a8dd8fb6 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -8,9 +8,13 @@ fi export HADOOP_USER_NAME=$2 + +# Set the active HDFS node of OCEAN and IMPALA cluster. +OCEAN_HDFS_NODE='hdfs://nameservice1' +echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" + IMPALA_HDFS_NODE='' COUNTER=0 - while [ $COUNTER -lt 3 ]; do if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' @@ -24,71 +28,178 @@ while [ $COUNTER -lt 3 ]; do fi ((COUNTER++)) done - if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n" + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' +IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' + +IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" + + +# Set sed arguments. 
+LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) + +# Set the SED command arguments for column-names with reserved words: +DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' +DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. +DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' + +HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' +HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' +HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' + +LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' +LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' +LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' + function copydb() { - - db=$1 - FILE=("hive_wf_tmp_"$RANDOM) - hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/ - # change ownership to impala -# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + # Delete the old DB from Impala cluster (if exists). + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + rm -f error.log + exit 2 + fi + # Make Impala aware of the deletion of the old DB immediately. 
+ sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - # copy the databases from ocean to impala - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/ + echo "Copying $db files from Ocean to Impala cluster.." + # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s + # Using max memory of: 50 * 6144 = 300 Gb + # Using 1MB as a buffer-size. + # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop + # The "ug" args cannot be used as we get a "User does not belong to hive" error. + # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. + hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ + -numListstatusThreads 40 \ + -copybuffersize 1048576 \ + -strategy dynamic \ + -pb \ + ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
+ #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - # drop tables from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; - done + echo "Creating schema for ${db}" - # drop views from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; - done + # create the new database (with the same name) + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" - # delete the database - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + # Make Impala aware of the creation of the new DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. + # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. 
- # create the databases - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + all_create_view_commands=() - impala-shell -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for (( k = 0; k < 5; k ++ )); do - for i in `impala-shell -d ${db} --delimited -q "show tables"`; - do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. + create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. 
+ + create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` + if [ -n "$create_view_command_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" + create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ + | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ + | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + all_create_view_commands+=("$create_view_command") + else + echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" + CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` + if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. + echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" + else + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + fi + fi + fi done - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done + echo -e "\nAll tables have been created, going to create the views..\n" - # deleting the remaining directory from hdfs -hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db + # Make Impala aware of the new tables. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + # Time to loop through the views and create them. + # At this point all table-schemas should have been created. + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG + + should_retry=1 # Should retry creating the views (in case their tables where not created before them). + # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. + + previous_num_of_views_to_retry=${#all_create_view_commands} + + while ((should_retry)); do + + # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. + # In this case, we should retry creating this particular view again. + should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
+ + should_retry_create_view_commands=() + + for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` + echo -e "\nspecific_errors: ${specific_errors}\n" + if [ -n "$specific_errors" ]; then + echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" + should_retry=1 + should_retry_create_view_commands+=("$create_view_command") + else + sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. + fi + done + + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + + new_num_of_views_to_retry=${#should_retry_create_view_commands} + if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then + echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + exit 3 + else + previous_num_of_views_to_retry=$new_num_of_views_to_retry + fi + + all_create_view_commands=$should_retry_create_view_command + done + + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + echo "Computing stats for tables.." 
+ for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. + create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + fi + done + + rm -f error.log + + echo -e "\n\nFinished processing db: ${db}\n\n" } + MONITOR_DB=$1 #HADOOP_USER_NAME=$2 copydb $MONITOR_DB diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 97fa0dd9c..4ff2b746d 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -8,9 +8,12 @@ fi export HADOOP_USER_NAME=$2 +# Set the active HDFS node of OCEAN and IMPALA cluster. 
# =====================================================================================
# Reconstructed post-patch content (the "+" side of the visible hunks) of:
#   dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/
#     stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
# The surrounding patch text was whitespace-mangled and is no longer applicable as a
# diff, so the changed region is re-emitted as properly formatted bash, with the bash
# bugs noted inline ("BUGFIX") repaired. The script's preceding context
# ("fi" / "export HADOOP_USER_NAME=$2") and the unchanged middle of the retry-loop
# below are elided by the hunks.
# =====================================================================================

# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"

IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
    if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
        IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
        # ... (lines unchanged by the patch are not visible in the hunks: the
        #      fall-back check of the second master-node and the retry-sleep) ...
    fi
    ((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
    echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
    exit 1
fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."

IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'

IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"


# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # Must be used with "sed -e" in order to have the "|" delimiter (the "/" conflicts with the URIs).

# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'

HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'

LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'

# Copies one database from the Ocean cluster to the Impala cluster:
# drops the old target DB, distcp's the parquet files, recreates the table-schemas
# from the parquet files, replays the "create view" statements (retrying nested
# views), and finally computes stats. Exits non-zero on unrecoverable errors.
function copydb() {
    db=$1

    # Delete the old DB from the Impala cluster (if it exists).
    # impala-shell prints all logs to stderr, so we need to capture them in a file, in order to "grep" them later.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log
    log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
    if [ -n "$log_errors" ]; then
        echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
        rm -f error.log
        exit 2
    fi

    # Make Impala aware of the deletion of the old DB immediately.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"

    echo "Copying $db files from Ocean to Impala cluster.."
    # Per the args below: up to 70 mappers * 150 MB/s bandwidth, 70 * 6144 MB of mapper memory,
    # and a 1MB copy-buffer.
    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop.
    # The "ug" preserve-args cannot be used as we get a "User does not belong to hive" error.
    # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
    hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
        -numListstatusThreads 40 \
        -copybuffersize 1048576 \
        -strategy dynamic \
        -pb \
        ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

    # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
    #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db

    echo "Creating schema for ${db}"

    # Create the new database (with the same name).
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"

    # Make Impala aware of the creation of the new DB immediately.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table" output from hive to create the exact same table in impala.
    # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.

    all_create_view_commands=()

    for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential "WARN" logs.
        # Check if this is a view: the create-command prints "CREATE VIEW" for a view instead of "CREATE TABLE". Unfortunately, there is no "show views" command.
        create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match the multi-line command.

        create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'`
        if [ -n "$create_view_command_test" ]; then
            echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n"
            create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \
                | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
                | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
                | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
            all_create_view_commands+=("$create_view_command")
        else
            echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
            CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
            if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is no parquet-file inside.
                echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
            else
                impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
                if [ -n "$log_errors" ]; then
                    echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
                fi
            fi
        fi
    done

    echo -e "\nAll tables have been created, going to create the views..\n"

    # Make Impala aware of the new tables.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    # Time to loop through the views and create them.
    # At this point all table-schemas should have been created.
    echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG

    should_retry=1 # Retry creating the views (in case their tables were not created before them).
    # There are views of other views as well, so we may have 3,4,5 levels of nesting and need to retry..

    # BUGFIX: use "[@]" — "${#arr}" is the string-length of the first element, not the element count.
    previous_num_of_views_to_retry=${#all_create_view_commands[@]}

    while ((should_retry)); do
        # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
        # In this case, we should retry creating this particular view again.
        should_retry=0 # Do NOT do another iteration, unless at least one view could NOT be created.

        should_retry_create_view_commands=()

        for create_view_command in "${all_create_view_commands[@]}"; do
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs to stderr.
            specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
            echo -e "\nspecific_errors: ${specific_errors}\n"
            if [ -n "$specific_errors" ]; then
                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
                should_retry=1
                should_retry_create_view_commands+=("$create_view_command")
            else
                sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
            fi
        done

        echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n"

        # BUGFIX: "[@]" for the element count; also skip the infinite-loop check when nothing is
        # left to retry, otherwise a DB with zero views would wrongly hit "0 -eq 0" and "exit 3".
        new_num_of_views_to_retry=${#should_retry_create_view_commands[@]}
        if [[ $new_num_of_views_to_retry -gt 0 && $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
            echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.."
            exit 3
        else
            previous_num_of_views_to_retry=$new_num_of_views_to_retry
        fi

        # BUGFIX: the original assigned the scalar "$should_retry_create_view_command" (typo,
        # undefined variable), which emptied the work-list; copy the retry-array instead.
        all_create_view_commands=("${should_retry_create_view_commands[@]}")
    done

    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    echo "Computing stats for tables.."
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do
        # Take the create-statement from the Ocean cluster, just to check if it's a view, as the output is easier to parse than impala-shell's.
        create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple lines.
        if [ -z "$create_view_command" ]; then # If it's a table, compute its stats.
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
        fi
    done

    rm -f error.log

    echo -e "\n\nFinished processing db: ${db}\n\n"
}


MONITOR_DB=$1
#HADOOP_USER_NAME=$2
copydb $MONITOR_DB
$COUNTER\n\n" + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' +IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' + +IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" + + +# Set sed arguments. +LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) + +# Set the SED command arguments for column-names with reserved words: +DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' +DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. +DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' + +HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' +HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' +HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' + +LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' +LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' +LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' + + +export HADOOP_USER="dimitris.pierrakos" +export HADOOP_USER_NAME='dimitris.pierrakos' + function copydb() { - - export HADOOP_USER="dimitris.pierrakos" - export HADOOP_USER_NAME='dimitris.pierrakos' - db=$1 - FILE=("hive_wf_tmp_"$RANDOM) - hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/φ - # change ownership to impala -# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + # Delete the old DB from Impala cluster (if exists). 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + rm -f error.log + exit 2 + fi + # Make Impala aware of the deletion of the old DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - # copy the databases from ocean to impala - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/ + echo "Copying $db files from Ocean to Impala cluster.." + # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s + # Using max memory of: 50 * 6144 = 300 Gb + # Using 1MB as a buffer-size. + # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop + # The "ug" args cannot be used as we get a "User does not belong to hive" error. + # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. + hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ + -numListstatusThreads 40 \ + -copybuffersize 1048576 \ + -strategy dynamic \ + -pb \ + ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
+ #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - # drop tables from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; - done + echo "Creating schema for ${db}" - # drop views from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; - done + # create the new database (with the same name) + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" - # delete the database - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + # Make Impala aware of the creation of the new DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. + # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. 
- # create the databases - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + all_create_view_commands=() - impala-shell -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for (( k = 0; k < 5; k ++ )); do - for i in `impala-shell -d ${db} --delimited -q "show tables"`; - do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. + create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. 
+ + create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` + if [ -n "$create_view_command_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" + create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ + | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ + | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + all_create_view_commands+=("$create_view_command") + else + echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" + CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` + if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. + echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" + else + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + fi + fi + fi done - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done + echo -e "\nAll tables have been created, going to create the views..\n" - # deleting the remaining directory from hdfs -hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db + # Make Impala aware of the new tables. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + # Time to loop through the views and create them. + # At this point all table-schemas should have been created. + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG + + should_retry=1 # Should retry creating the views (in case their tables where not created before them). + # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. + + previous_num_of_views_to_retry=${#all_create_view_commands} + + while ((should_retry)); do + + # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. + # In this case, we should retry creating this particular view again. + should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
+ + should_retry_create_view_commands=() + + for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` + echo -e "\nspecific_errors: ${specific_errors}\n" + if [ -n "$specific_errors" ]; then + echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" + should_retry=1 + should_retry_create_view_commands+=("$create_view_command") + else + sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. + fi + done + + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + + new_num_of_views_to_retry=${#should_retry_create_view_commands} + if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then + echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + exit 3 + else + previous_num_of_views_to_retry=$new_num_of_views_to_retry + fi + + all_create_view_commands=$should_retry_create_view_command + done + + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + echo "Computing stats for tables.." 
+ for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. + create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + fi + done + + rm -f error.log + + echo -e "\n\nFinished processing db: ${db}\n\n" } + MONITOR_DB=$1 #HADOOP_USER_NAME=$2 diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 3f8447b6c..0f248a79f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,9 +6,13 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi + +# Set the active HDFS node of OCEAN and IMPALA cluster. +OCEAN_HDFS_NODE='hdfs://nameservice1' +echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" + IMPALA_HDFS_NODE='' COUNTER=0 - while [ $COUNTER -lt 3 ]; do if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' @@ -22,76 +26,178 @@ while [ $COUNTER -lt 3 ]; do fi ((COUNTER++)) done - if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! 
$COUNTER\n\n" + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' +IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' + +IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" + +# Set sed arguments. +LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) + +# Set the SED command arguments for column-names with reserved words: +DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' +DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. +DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' + +HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' +HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' +HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' + +LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' +LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' +LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' + export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" + + function copydb() { db=$1 - FILE=("hive_wf_tmp_"$RANDOM) - hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/ - # copy the databases from ocean to impala - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/ + # Delete the old DB from Impala cluster (if exists). 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + rm -f error.log + exit 2 + fi - # change ownership to impala - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + # Make Impala aware of the deletion of the old DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - # drop tables from db - for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; - done + echo "Copying $db files from Ocean to Impala cluster.." + # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s + # Using max memory of: 50 * 6144 = 300 Gb + # Using 1MB as a buffer-size. + # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop + # The "ug" args cannot be used as we get a "User does not belong to hive" error. + # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. 
+ hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ + -numListstatusThreads 40 \ + -copybuffersize 1048576 \ + -strategy dynamic \ + -pb \ + ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # drop views from db - for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; - done + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. + #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - # delete the database - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + echo "Creating schema for ${db}" - # create the databases - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + # create the new database (with the same name) + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" - impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for (( k = 0; k < 5; k ++ )); do - for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; - do - impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + # Make Impala aware of the creation of the new DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. 
+ # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. + + all_create_view_commands=() + + for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. + create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + + create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` + if [ -n "$create_view_command_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" + create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ + | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ + | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + all_create_view_commands+=("$create_view_command") + else + echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" + CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` + if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. + echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" + else + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + fi + fi + fi done -# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; -# do -# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; -# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - -# -# # run the same command twice because we may have failures in the first run (due to views pointing to the same db) -# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; -# do -# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; -# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + echo -e "\nAll tables have been created, going to create the views..\n" - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done + # Make Impala aware of the new tables. 
+ sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 - # deleting the remaining directory from hdfs -hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db + # Time to loop through the views and create them. + # At this point all table-schemas should have been created. + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG + + should_retry=1 # Should retry creating the views (in case their tables where not created before them). + # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. + + previous_num_of_views_to_retry=${#all_create_view_commands} + + while ((should_retry)); do + + # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. + # In this case, we should retry creating this particular view again. + should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. + + should_retry_create_view_commands=() + + for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` + echo -e "\nspecific_errors: ${specific_errors}\n" + if [ -n "$specific_errors" ]; then + echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" + should_retry=1 + should_retry_create_view_commands+=("$create_view_command") + else + sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. + fi + done + + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + + new_num_of_views_to_retry=${#should_retry_create_view_commands} + if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then + echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + exit 3 + else + previous_num_of_views_to_retry=$new_num_of_views_to_retry + fi + + all_create_view_commands=$should_retry_create_view_command + done + + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + echo "Computing stats for tables.." + for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. + create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. 
+ if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + fi + done + + rm -f error.log + + echo -e "\n\nFinished processing db: ${db}\n\n" } STATS_DB=$1 From 22745027c8f900be793b342c70427adf8c959c91 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 11 Apr 2024 17:46:33 +0300 Subject: [PATCH 3/8] Use the "HADOOP_USER_NAME" value from the "workflow-property", in "copyDataToImpalaCluster.sh", in "stats-monitor-updates". --- .../stats-monitor/oozie_app/copyDataToImpalaCluster.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index a16f769e7..82c38bb65 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -6,7 +6,7 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi -#export HADOOP_USER_NAME=$2 +export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. 
OCEAN_HDFS_NODE='hdfs://nameservice1' @@ -56,10 +56,6 @@ LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' -export HADOOP_USER="dimitris.pierrakos" -export HADOOP_USER_NAME='dimitris.pierrakos' - - function copydb() { db=$1 @@ -204,7 +200,6 @@ function copydb() { MONITOR_DB=$1 -#HADOOP_USER_NAME=$2 copydb $MONITOR_DB'_institutions' copydb $MONITOR_DB From 14719dcd6202233dd076fc377094e7d48cdc1d22 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 12 Apr 2024 15:36:13 +0300 Subject: [PATCH 4/8] Miscellaneous updates to the copying operation to Impala Cluster: - Update the algorithm for creating views that depend on other views. - Add check for successful execution of the "hadoop distcp" command. - Add a check for successful copy operation of all entities. - Upon facing an error in a DB, exit the method, instead of the whole script. - Improve logging. - Code polishing. --- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- 4 files changed, 160 insertions(+), 80 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 3a8dd8fb6..fceb1b76b 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -66,7 +66,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! 
EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -87,6 +87,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -104,7 +113,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -140,45 +150,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. 
previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. - should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. 
fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." - exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -186,7 +195,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -194,6 +206,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 4ff2b746d..7ff6a5d52 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -65,7 +65,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -86,6 +86,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -103,7 +112,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -139,45 +149,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
- should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." 
- exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -185,7 +194,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -193,6 +205,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 82c38bb65..8900adcb5 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -65,7 +65,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -86,6 +86,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -103,7 +112,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -139,45 +149,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
- should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." 
- exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -185,7 +194,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -193,6 +205,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 0f248a79f..bbb5e43ee 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -67,7 +67,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -88,6 +88,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -105,7 +114,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. 
+ entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -141,45 +151,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. - should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. 
+ for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." 
- exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -187,7 +196,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -195,6 +207,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" From d7da4f814ba17f71e09c047c0de8cbcd90c481b8 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 12 Apr 2024 18:12:06 +0300 Subject: [PATCH 5/8] Minor updates to the copying operation to Impala Cluster: - Improve logging. - Code optimization/polishing. --- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- 4 files changed, 160 insertions(+), 172 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index fceb1b76b..3d9986b64 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -11,7 +11,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. 
OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -29,10 +29,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -59,12 +59,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -73,7 +74,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." 
+ echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -89,9 +90,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -99,7 +100,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -111,31 +112,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". 
Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | 
grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -143,65 +144,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. 
+ sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -209,13 +207,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 7ff6a5d52..2711d6e12 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -10,7 +10,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -28,10 +28,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -58,12 +58,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -72,7 +73,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -88,9 +89,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -98,7 +99,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -110,31 +111,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -142,65 +143,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -208,13 +206,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 8900adcb5..5ad9df762 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -10,7 +10,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -28,10 +28,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -58,12 +58,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -72,7 +73,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -88,9 +89,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -98,7 +99,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -110,31 +111,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -142,65 +143,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -208,13 +206,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index bbb5e43ee..c2324b912 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -9,7 +9,7 @@ fi # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -27,10 +27,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -60,12 +60,13 @@ export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -74,7 +75,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -90,9 +91,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -100,7 +101,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -112,31 +113,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -144,65 +145,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -210,13 +208,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } From 43b454399f2099912dcc31961f03dce6ce2b41cd Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 15 Apr 2024 18:19:29 +0200 Subject: [PATCH 6/8] - Bug fix in matchOrderedTokenAndAbbreviations algorithms where tokens with same initial character were always considered equal - AuthorsMatch exploits the new matching strategy used for ORCID enhancements in #PR398: split author names in tokens, order the tokens, then check for matches of ordered full tokens or abbreviations --- .../dhp/schema/oaf/utils/MergeUtils.java | 7 +- .../eu/dnetlib/pace/tree/AuthorsMatch.java | 45 ++++-- .../eu/dnetlib/pace/util/AuthorMatchers.scala | 53 ++++++- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 145 ++++++++++-------- .../enrich/orcid/ORCIDAuthorEnricher.scala | 11 +- .../orcid/ORCIDAuthorMatchersTest.scala | 2 +- 6 files changed, 169 insertions(+), 94 deletions(-) rename dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala => dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala (56%) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index 0ff90e024..316891faf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -497,9 +497,14 @@ 
public class MergeUtils { } private static Field selectOldestDate(Field d1, Field d2) { + if (d1 == null || StringUtils.isBlank(d1.getValue())) { + return d2; + } else if (d2 == null || StringUtils.isBlank(d2.getValue())) { + return d1; + } + return Stream .of(d1, d2) - .filter(Objects::nonNull) .min( Comparator .comparing( diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index edad0ae2e..0921d7a64 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -1,16 +1,18 @@ package eu.dnetlib.pace.tree; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - import com.wcohen.ss.AbstractStringDistance; - import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.util.AuthorMatchers; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.stream.Collectors; @ComparatorClass("authorsMatch") public class AuthorsMatch extends AbstractListComparator { @@ -41,24 +43,36 @@ public class AuthorsMatch extends AbstractListComparator { } @Override - public double compare(final List a, final List b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) + public double compare(final List left, final List right, final Config conf) { + if (left.isEmpty() || right.isEmpty()) return -1; - if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) + if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD) return 1.0; - int maxMiss = Integer.MAX_VALUE; - List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - Double threshold = getDoubleParam("threshold"); + int maxMiss = 
Integer.MAX_VALUE; - if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) { - maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size())); + if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) { + maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size())); } int common = 0; + + List a = new ArrayList<>(left); + List b = new ArrayList<>(right); + + common += AuthorMatchers + .removeMatches(a, b, (BiFunction) AuthorMatchers::matchEqualsIgnoreCase) + .size() / 2; + common += AuthorMatchers + .removeMatches(a, b, (BiFunction) AuthorMatchers::matchOrderedTokenAndAbbreviations) + .size() / 2; + + List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + // compare each element of List1 with each element of List2 + int alreadyMatched = common; for (int i = 0; i < a.size(); i++) { Person p1 = new Person(a.get(i), false); @@ -123,13 +137,13 @@ public class AuthorsMatch extends AbstractListComparator { } } - if (i - common > maxMiss) { + if (i - common - alreadyMatched > maxMiss) { return 0.0; } } // normalization factor to compute the score - int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common); + int normFactor = left.size() == right.size() ? 
left.size() : (left.size() + right.size() - common); if (TYPE.equals("percentage")) { return (double) common / normFactor; @@ -160,5 +174,4 @@ public class AuthorsMatch extends AbstractListComparator { public String normalization(String s) { return normalize(utf8(cleanup(s))); } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala similarity index 56% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala rename to dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala index 49574fe2d..116f515ed 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala @@ -1,9 +1,10 @@ -package eu.dnetlib.dhp.enrich.orcid +package eu.dnetlib.pace.util import java.util.Locale import java.util.regex.Pattern +import scala.util.control.Breaks.{break, breakable} -object ORCIDAuthorMatchers { +object AuthorMatchers { val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+") val WORD_DIFF = 2 @@ -45,7 +46,8 @@ object ORCIDAuthorMatchers { var res: Boolean = false if (e1.length != 1 && e2.length != 1) { res = e1 == e2 - longMatches += 1 + if (res) + longMatches += 1 } else { res = true shortMatches += 1 @@ -62,4 +64,49 @@ object ORCIDAuthorMatchers { } longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length) } + + def removeMatches( + graph_authors: java.util.List[String], + orcid_authors: java.util.List[String], + matchingFunc: java.util.function.BiFunction[String,String,Boolean] + ) : java.util.List[String] = { + removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b)) + } + + + def removeMatches( + graph_authors: java.util.List[String], + orcid_authors: java.util.List[String], + matchingFunc: (String, String) => Boolean + ) 
: java.util.List[String] = { + val matched = new java.util.ArrayList[String]() + + if (graph_authors != null && !graph_authors.isEmpty) { + val ait = graph_authors.iterator + + while (ait.hasNext) { + val author = ait.next() + val oit = orcid_authors.iterator + + breakable { + while (oit.hasNext) { + val orcid = oit.next() + + if (matchingFunc(author, orcid)) { + ait.remove() + oit.remove() + + matched.add(author) + matched.add(orcid) + + break() + } + } + } + } + } + + matched + } + } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 8b3480e60..2c96b7399 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -43,15 +43,13 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import eu.dnetlib.dhp.schema.sx.OafUtils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import scala.Tuple2; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class SparkDedupTest implements Serializable { + static final boolean CHECK_CARDINALITIES = true; @Mock(serializable = true) ISLookUpService isLookUpService; @@ -191,12 +189,13 @@ public class SparkDedupTest implements Serializable { System.out.println("ds_simrel = " + ds_simrel); System.out.println("orp_simrel = " + orp_simrel); - assertEquals(751, orgs_simrel); - assertEquals(546, pubs_simrel); - assertEquals(113, sw_simrel); - assertEquals(148, ds_simrel); - assertEquals(280, orp_simrel); - + if (CHECK_CARDINALITIES) { 
+ assertEquals(751, orgs_simrel); + assertEquals(566, pubs_simrel); + assertEquals(113, sw_simrel); + assertEquals(148, ds_simrel); + assertEquals(280, orp_simrel); + } } @Test @@ -239,21 +238,27 @@ public class SparkDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct")) .count(); - // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) - assertEquals(751, orgs_simrel); - assertEquals(546, pubs_simrel); - assertEquals(148, ds_simrel); - assertEquals(280, orp_simrel); -// System.out.println("orgs_simrel = " + orgs_simrel); -// System.out.println("pubs_simrel = " + pubs_simrel); -// System.out.println("ds_simrel = " + ds_simrel); -// System.out.println("orp_simrel = " + orp_simrel); - // entities simrels to be different from the number of previous step (new simrels in the whitelist) Dataset sw_simrel = spark .read() .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software")); + System.out.println("orgs_simrel = " + orgs_simrel); + System.out.println("pubs_simrel = " + pubs_simrel); + System.out.println("ds_simrel = " + ds_simrel); + System.out.println("orp_simrel = " + orp_simrel); + System.out.println("sw_simrel = " + sw_simrel.count()); + + // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) + if (CHECK_CARDINALITIES) { + assertEquals(751, orgs_simrel); + assertEquals(566, pubs_simrel); + assertEquals(148, ds_simrel); + assertEquals(280, orp_simrel); + assertEquals(115, sw_simrel.count()); + } + + // check if the first relation in the whitelist exists assertTrue( sw_simrel @@ -272,10 +277,6 @@ public class SparkDedupTest implements Serializable { rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0]) && rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1])) .count() > 0); - - assertEquals(115, sw_simrel.count()); -// 
System.out.println("sw_simrel = " + sw_simrel.count()); - } @Test @@ -466,17 +467,19 @@ public class SparkDedupTest implements Serializable { assertTrue(dups.contains(r.getSource())); }); - assertEquals(1268, orgs_mergerel); - assertEquals(1112, pubs.count()); - assertEquals(292, sw_mergerel); - assertEquals(476, ds_mergerel); - assertEquals(742, orp_mergerel); -// System.out.println("orgs_mergerel = " + orgs_mergerel); -// System.out.println("pubs_mergerel = " + pubs_mergerel); -// System.out.println("sw_mergerel = " + sw_mergerel); -// System.out.println("ds_mergerel = " + ds_mergerel); -// System.out.println("orp_mergerel = " + orp_mergerel); + System.out.println("orgs_mergerel = " + orgs_mergerel); + System.out.println("pubs_mergerel = " + pubs.count()); + System.out.println("sw_mergerel = " + sw_mergerel); + System.out.println("ds_mergerel = " + ds_mergerel); + System.out.println("orp_mergerel = " + orp_mergerel); + if (CHECK_CARDINALITIES) { + assertEquals(1268, orgs_mergerel); + assertEquals(1156, pubs.count()); + assertEquals(292, sw_mergerel); + assertEquals(476, ds_mergerel); + assertEquals(742, orp_mergerel); + } } @Test @@ -552,17 +555,19 @@ public class SparkDedupTest implements Serializable { assertTrue(dups.contains(r.getSource())); }); - assertEquals(1268, orgs_mergerel); - assertEquals(1112, pubs.count()); - assertEquals(292, sw_mergerel); - assertEquals(476, ds_mergerel); - assertEquals(742, orp_mergerel); -// System.out.println("orgs_mergerel = " + orgs_mergerel); -// System.out.println("pubs_mergerel = " + pubs_mergerel); -// System.out.println("sw_mergerel = " + sw_mergerel); -// System.out.println("ds_mergerel = " + ds_mergerel); -// System.out.println("orp_mergerel = " + orp_mergerel); + System.out.println("orgs_mergerel = " + orgs_mergerel); + System.out.println("pubs_mergerel = " + pubs.count()); + System.out.println("sw_mergerel = " + sw_mergerel); + System.out.println("ds_mergerel = " + ds_mergerel); + System.out.println("orp_mergerel = " 
+ orp_mergerel); + if (CHECK_CARDINALITIES) { + assertEquals(1268, orgs_mergerel); + assertEquals(1156, pubs.count()); + assertEquals(292, sw_mergerel); + assertEquals(476, ds_mergerel); + assertEquals(742, orp_mergerel); + } } @Test @@ -607,19 +612,21 @@ public class SparkDedupTest implements Serializable { testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") .count(); - assertEquals(86, orgs_deduprecord); - assertEquals(91, pubs.count()); - assertEquals(47, sw_deduprecord); - assertEquals(97, ds_deduprecord); - assertEquals(92, orp_deduprecord); + System.out.println("orgs_deduprecord = " + orgs_deduprecord); + System.out.println("pubs_deduprecord = " + pubs.count()); + System.out.println("sw_deduprecord = " + sw_deduprecord); + System.out.println("ds_deduprecord = " + ds_deduprecord); + System.out.println("orp_deduprecord = " + orp_deduprecord); + + if (CHECK_CARDINALITIES) { + assertEquals(86, orgs_deduprecord); + assertEquals(96, pubs.count()); + assertEquals(47, sw_deduprecord); + assertEquals(97, ds_deduprecord); + assertEquals(92, orp_deduprecord); + } verifyRoot_1(mapper, pubs); - -// System.out.println("orgs_deduprecord = " + orgs_deduprecord); -// System.out.println("pubs_deduprecord = " + pubs_deduprecord); -// System.out.println("sw_deduprecord = " + sw_deduprecord); -// System.out.println("ds_deduprecord = " + ds_deduprecord); -// System.out.println("orp_deduprecord = " + orp_deduprecord); } private static void verifyRoot_1(ObjectMapper mapper, Dataset pubs) { @@ -745,21 +752,23 @@ public class SparkDedupTest implements Serializable { .distinct() .count(); - assertEquals(925, publications); - assertEquals(839, organizations); - assertEquals(100, projects); - assertEquals(100, datasource); - assertEquals(196, softwares); - assertEquals(389, dataset); - assertEquals(520, otherresearchproduct); + System.out.println("publications = " + publications); + System.out.println("organizations = " + organizations); + 
System.out.println("projects = " + projects); + System.out.println("datasource = " + datasource); + System.out.println("software = " + softwares); + System.out.println("dataset = " + dataset); + System.out.println("otherresearchproduct = " + otherresearchproduct); -// System.out.println("publications = " + publications); -// System.out.println("organizations = " + organizations); -// System.out.println("projects = " + projects); -// System.out.println("datasource = " + datasource); -// System.out.println("software = " + softwares); -// System.out.println("dataset = " + dataset); -// System.out.println("otherresearchproduct = " + otherresearchproduct); + if (CHECK_CARDINALITIES) { + assertEquals(930, publications); + assertEquals(839, organizations); + assertEquals(100, projects); + assertEquals(100, datasource); + assertEquals(196, softwares); + assertEquals(389, dataset); + assertEquals(520, otherresearchproduct); + } long deletedOrgs = jsc .textFile(testDedupGraphBasePath + "/organization") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala index e2e7fada6..2e23a3a59 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.enrich.orcid import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty} import eu.dnetlib.dhp.schema.sx.OafUtils +import eu.dnetlib.pace.util.AuthorMatchers import java.util import scala.beans.BeanProperty @@ -39,7 +40,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName), + 
AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName), "fullName" ) ++ // Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName @@ -47,7 +48,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName), + AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName), "reversedFullName" ) ++ // split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations @@ -55,7 +56,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers + AuthorMatchers .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName), "orderedTokens" ) ++ @@ -63,7 +64,7 @@ object ORCIDAuthorEnricher extends Serializable { extractAndEnrichMatches( unmatched_authors, orcid_authors, - (author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName), + (author, orcid) => AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName), "creditName" ) ++ // look after exact matches in ORCID otherNames @@ -71,7 +72,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala), + orcid.otherNames != null && AuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala), "otherNames" ) } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala index f109ebe24..f3a5fe77c 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala @@ -1,6 +1,6 @@ package eu.dnetlib.dhp.enrich.orcid -import eu.dnetlib.dhp.enrich.orcid.ORCIDAuthorMatchers.matchOrderedTokenAndAbbreviations +import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue} import org.junit.jupiter.api.Test From 78b9d84e4a380699d7316a528eac49bcc7a3dbb9 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 16 Apr 2024 09:41:16 +0200 Subject: [PATCH 7/8] test --- .../plugin/rest/OsfPreprintCollectorTest.java | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java new file mode 100644 index 000000000..2f0263a0d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -0,0 +1,84 @@ +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; + +public class 
OsfPreprintCollectorTest { + + private static final Logger log = LoggerFactory.getLogger(OsfPreprintCollectorTest.class); + + private final String baseUrl = "https://api.osf.io/v2/preprints/"; + + // private final String requestHeaderMap = ""; + // private final String authMethod = ""; + // private final String authToken = ""; + // private final String resultOutputFormat = ""; + + private final String queryParams = "filter:is_published:d=true"; + + private final String entityXpath = "/*/*[local-name()='data']"; + + private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']"; + + private final String resumptionParam = "page"; + private final String resumptionType = "page"; + private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; + + private final String resultSizeParam = ""; + private final String resultSizeValue = ""; + + private final String resultFormatParam = "format"; + private final String resultFormatValue = "json"; + + private final ApiDescriptor api = new ApiDescriptor(); + private RestCollectorPlugin rcp; + + @BeforeEach + public void setUp() { + final HashMap params = new HashMap<>(); + params.put("resumptionType", this.resumptionType); + params.put("resumptionParam", this.resumptionParam); + params.put("resumptionXpath", this.resumptionXpath); + params.put("resultTotalXpath", this.resultTotalXpath); + params.put("resultFormatParam", this.resultFormatParam); + params.put("resultFormatValue", this.resultFormatValue); + params.put("resultSizeParam", this.resultSizeParam); + params.put("resultSizeValue", this.resultSizeValue); + params.put("queryParams", this.queryParams); + params.put("entityXpath", this.entityXpath); + + this.api.setBaseUrl(this.baseUrl); + this.api.setParams(params); + + this.rcp = new RestCollectorPlugin(new HttpClientParams()); + } + + @Test + @Disabled + void test() throws CollectorException { + final AtomicInteger i = new AtomicInteger(0); + 
final Stream stream = this.rcp.collect(this.api, new AggregatorReport()); + + stream.limit(200).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + i.incrementAndGet(); + log.info(s); + }); + + log.info("{}", i.intValue()); + Assertions.assertTrue(i.intValue() > 0); + } +} From d070db4a32c80d7715d5eddccde68341a3ac7314 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 16 Apr 2024 09:41:59 +0200 Subject: [PATCH 8/8] added a couple more invalid author names --- .../eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 9386db933..2be4e8e0c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -92,6 +92,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { INVALID_AUTHOR_NAMES.add("null anonymous"); INVALID_AUTHOR_NAMES.add("unbekannt"); INVALID_AUTHOR_NAMES.add("unknown"); + INVALID_AUTHOR_NAMES.add("autor, Sin"); + INVALID_AUTHOR_NAMES.add("Desconocido / Inconnu,"); INVALID_URL_HOSTS.add("creativecommons.org"); INVALID_URL_HOSTS.add("www.academia.edu");