From 3cad4a415d68c495fe723adadf9b43c46d72e80d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 11 Apr 2024 15:44:12 +0200 Subject: [PATCH 1/8] fixed duplicated property dhp-schemas.version --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7387f6e3b..8e6f16fe5 100644 --- a/pom.xml +++ b/pom.xml @@ -888,7 +888,6 @@ 3.3.3 3.4.2 [2.12,3.0) - [5.17.3] [6.1.0] [4.0.3] [6.0.5] From abf0b69f29a4473a582ee137bacdabd51fec5a3c Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 11 Apr 2024 17:12:12 +0300 Subject: [PATCH 2/8] Upgrade the copying operation to Impala Cluster: - Use only hive commands in the Ocean Cluster, as the "impala-shell" will be removed from there to free-up resources. - Hugely improve the performance in every aspect of the copying process: a) speedup file-transferring and DB-deletion, b) eliminate permissions-assignment, "load" operations and "use $db" queries, c) retry only the "create view" statements and only as long as they depend on other non-created views, instead of trying to recreate all tables and views 5 consecutive times. - Add error-checks for the creation of tables and views. 
--- .../oozie_app/copyDataToImpalaCluster.sh | 199 +++++++++++++---- .../oozie_app/copyDataToImpalaCluster.sh | 197 +++++++++++++---- .../oozie_app/copyDataToImpalaCluster.sh | 204 +++++++++++++---- .../oozie_app/copyDataToImpalaCluster.sh | 206 +++++++++++++----- 4 files changed, 623 insertions(+), 183 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 6250aca81..3a8dd8fb6 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -8,9 +8,13 @@ fi export HADOOP_USER_NAME=$2 + +# Set the active HDFS node of OCEAN and IMPALA cluster. +OCEAN_HDFS_NODE='hdfs://nameservice1' +echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" + IMPALA_HDFS_NODE='' COUNTER=0 - while [ $COUNTER -lt 3 ]; do if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' @@ -24,71 +28,178 @@ while [ $COUNTER -lt 3 ]; do fi ((COUNTER++)) done - if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n" + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' +IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' + +IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" + + +# Set sed arguments. 
+LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) + +# Set the SED command arguments for column-names with reserved words: +DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' +DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. +DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' + +HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' +HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' +HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' + +LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' +LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' +LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' + function copydb() { - - db=$1 - FILE=("hive_wf_tmp_"$RANDOM) - hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/ - # change ownership to impala -# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + # Delete the old DB from Impala cluster (if exists). + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + rm -f error.log + exit 2 + fi + # Make Impala aware of the deletion of the old DB immediately. 
+ sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - # copy the databases from ocean to impala - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/ + echo "Copying $db files from Ocean to Impala cluster.." + # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s + # Using max memory of: 50 * 6144 = 300 Gb + # Using 1MB as a buffer-size. + # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop + # The "ug" args cannot be used as we get a "User does not belong to hive" error. + # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. + hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ + -numListstatusThreads 40 \ + -copybuffersize 1048576 \ + -strategy dynamic \ + -pb \ + ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
+ #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - # drop tables from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; - done + echo "Creating schema for ${db}" - # drop views from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; - done + # create the new database (with the same name) + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" - # delete the database - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + # Make Impala aware of the creation of the new DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. + # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. 
- # create the databases - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + all_create_view_commands=() - impala-shell -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for (( k = 0; k < 5; k ++ )); do - for i in `impala-shell -d ${db} --delimited -q "show tables"`; - do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. + create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. 
+ + create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` + if [ -n "$create_view_command_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" + create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ + | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ + | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + all_create_view_commands+=("$create_view_command") + else + echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" + CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` + if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. + echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" + else + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + fi + fi + fi done - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done + echo -e "\nAll tables have been created, going to create the views..\n" - # deleting the remaining directory from hdfs -hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db + # Make Impala aware of the new tables. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + # Time to loop through the views and create them. + # At this point all table-schemas should have been created. + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG + + should_retry=1 # Should retry creating the views (in case their tables where not created before them). + # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. + + previous_num_of_views_to_retry=${#all_create_view_commands} + + while ((should_retry)); do + + # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. + # In this case, we should retry creating this particular view again. + should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
+ + should_retry_create_view_commands=() + + for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` + echo -e "\nspecific_errors: ${specific_errors}\n" + if [ -n "$specific_errors" ]; then + echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" + should_retry=1 + should_retry_create_view_commands+=("$create_view_command") + else + sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. + fi + done + + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + + new_num_of_views_to_retry=${#should_retry_create_view_commands} + if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then + echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + exit 3 + else + previous_num_of_views_to_retry=$new_num_of_views_to_retry + fi + + all_create_view_commands=$should_retry_create_view_command + done + + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + echo "Computing stats for tables.." 
+ for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. + create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + fi + done + + rm -f error.log + + echo -e "\n\nFinished processing db: ${db}\n\n" } + MONITOR_DB=$1 #HADOOP_USER_NAME=$2 copydb $MONITOR_DB diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 97fa0dd9c..4ff2b746d 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -8,9 +8,12 @@ fi export HADOOP_USER_NAME=$2 +# Set the active HDFS node of OCEAN and IMPALA cluster. 
# =====================================================================================
# Reconstructed post-patch content (the "+" side of the visible hunks) of:
#   dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/
#     stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
# The surrounding patch text was whitespace-mangled and is no longer applicable as a
# diff, so the changed region is re-emitted as properly formatted bash, with the bash
# bugs noted inline ("BUGFIX") repaired. The script's preceding context
# ("fi" / "export HADOOP_USER_NAME=$2") and the unchanged middle of the retry-loop
# below are elided by the hunks.
# =====================================================================================

# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"

IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
    if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
        IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
        # ... (lines unchanged by the patch are not visible in the hunks: the
        #      fall-back check of the second master-node and the retry-sleep) ...
    fi
    ((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
    echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
    exit 1
fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."

IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'

IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"


# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # Must be used with "sed -e" in order to have the "|" delimiter (the "/" conflicts with the URIs).

# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'

HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'

LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'

# Copies one database from the Ocean cluster to the Impala cluster:
# drops the old target DB, distcp's the parquet files, recreates the table-schemas
# from the parquet files, replays the "create view" statements (retrying nested
# views), and finally computes stats. Exits non-zero on unrecoverable errors.
function copydb() {
    db=$1

    # Delete the old DB from the Impala cluster (if it exists).
    # impala-shell prints all logs to stderr, so we need to capture them in a file, in order to "grep" them later.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log
    log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
    if [ -n "$log_errors" ]; then
        echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
        rm -f error.log
        exit 2
    fi

    # Make Impala aware of the deletion of the old DB immediately.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"

    echo "Copying $db files from Ocean to Impala cluster.."
    # Per the args below: up to 70 mappers * 150 MB/s bandwidth, 70 * 6144 MB of mapper memory,
    # and a 1MB copy-buffer.
    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop.
    # The "ug" preserve-args cannot be used as we get a "User does not belong to hive" error.
    # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
    hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
        -numListstatusThreads 40 \
        -copybuffersize 1048576 \
        -strategy dynamic \
        -pb \
        ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

    # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
    #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db

    echo "Creating schema for ${db}"

    # Create the new database (with the same name).
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"

    # Make Impala aware of the creation of the new DB immediately.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table" output from hive to create the exact same table in impala.
    # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.

    all_create_view_commands=()

    for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential "WARN" logs.
        # Check if this is a view: the create-command prints "CREATE VIEW" for a view instead of "CREATE TABLE". Unfortunately, there is no "show views" command.
        create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match the multi-line command.

        create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'`
        if [ -n "$create_view_command_test" ]; then
            echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n"
            create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \
                | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
                | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
                | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
            all_create_view_commands+=("$create_view_command")
        else
            echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
            CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
            if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is no parquet-file inside.
                echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
            else
                impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
                if [ -n "$log_errors" ]; then
                    echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
                fi
            fi
        fi
    done

    echo -e "\nAll tables have been created, going to create the views..\n"

    # Make Impala aware of the new tables.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    # Time to loop through the views and create them.
    # At this point all table-schemas should have been created.
    echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG

    should_retry=1 # Retry creating the views (in case their tables were not created before them).
    # There are views of other views as well, so we may have 3,4,5 levels of nesting and need to retry..

    # BUGFIX: use "[@]" — "${#arr}" is the string-length of the first element, not the element count.
    previous_num_of_views_to_retry=${#all_create_view_commands[@]}

    while ((should_retry)); do
        # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
        # In this case, we should retry creating this particular view again.
        should_retry=0 # Do NOT do another iteration, unless at least one view could NOT be created.

        should_retry_create_view_commands=()

        for create_view_command in "${all_create_view_commands[@]}"; do
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs to stderr.
            specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
            echo -e "\nspecific_errors: ${specific_errors}\n"
            if [ -n "$specific_errors" ]; then
                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
                should_retry=1
                should_retry_create_view_commands+=("$create_view_command")
            else
                sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
            fi
        done

        echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n"

        # BUGFIX: "[@]" for the element count; also skip the infinite-loop check when nothing is
        # left to retry, otherwise a DB with zero views would wrongly hit "0 -eq 0" and "exit 3".
        new_num_of_views_to_retry=${#should_retry_create_view_commands[@]}
        if [[ $new_num_of_views_to_retry -gt 0 && $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
            echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.."
            exit 3
        else
            previous_num_of_views_to_retry=$new_num_of_views_to_retry
        fi

        # BUGFIX: the original assigned the scalar "$should_retry_create_view_command" (typo,
        # undefined variable), which emptied the work-list; copy the retry-array instead.
        all_create_view_commands=("${should_retry_create_view_commands[@]}")
    done

    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    echo "Computing stats for tables.."
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do
        # Take the create-statement from the Ocean cluster, just to check if it's a view, as the output is easier to parse than impala-shell's.
        create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple lines.
        if [ -z "$create_view_command" ]; then # If it's a table, compute its stats.
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
        fi
    done

    rm -f error.log

    echo -e "\n\nFinished processing db: ${db}\n\n"
}


MONITOR_DB=$1
#HADOOP_USER_NAME=$2
copydb $MONITOR_DB
$COUNTER\n\n" + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' +IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' + +IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" + + +# Set sed arguments. +LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) + +# Set the SED command arguments for column-names with reserved words: +DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' +DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. +DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' + +HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' +HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' +HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' + +LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' +LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' +LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' + + +export HADOOP_USER="dimitris.pierrakos" +export HADOOP_USER_NAME='dimitris.pierrakos' + function copydb() { - - export HADOOP_USER="dimitris.pierrakos" - export HADOOP_USER_NAME='dimitris.pierrakos' - db=$1 - FILE=("hive_wf_tmp_"$RANDOM) - hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/φ - # change ownership to impala -# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + # Delete the old DB from Impala cluster (if exists). 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + rm -f error.log + exit 2 + fi + # Make Impala aware of the deletion of the old DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - # copy the databases from ocean to impala - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/ + echo "Copying $db files from Ocean to Impala cluster.." + # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s + # Using max memory of: 50 * 6144 = 300 Gb + # Using 1MB as a buffer-size. + # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop + # The "ug" args cannot be used as we get a "User does not belong to hive" error. + # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. + hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ + -numListstatusThreads 40 \ + -copybuffersize 1048576 \ + -strategy dynamic \ + -pb \ + ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
+ #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - # drop tables from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; - done + echo "Creating schema for ${db}" - # drop views from db - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; - done + # create the new database (with the same name) + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" - # delete the database - impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + # Make Impala aware of the creation of the new DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. + # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. 
- # create the databases - impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + all_create_view_commands=() - impala-shell -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for (( k = 0; k < 5; k ++ )); do - for i in `impala-shell -d ${db} --delimited -q "show tables"`; - do - impala-shell -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. + create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. 
+ + create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` + if [ -n "$create_view_command_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" + create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ + | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ + | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + all_create_view_commands+=("$create_view_command") + else + echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" + CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` + if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. + echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" + else + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + fi + fi + fi done - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done + echo -e "\nAll tables have been created, going to create the views..\n" - # deleting the remaining directory from hdfs -hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db + # Make Impala aware of the new tables. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + # Time to loop through the views and create them. + # At this point all table-schemas should have been created. + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG + + should_retry=1 # Should retry creating the views (in case their tables where not created before them). + # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. + + previous_num_of_views_to_retry=${#all_create_view_commands} + + while ((should_retry)); do + + # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. + # In this case, we should retry creating this particular view again. + should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
+ + should_retry_create_view_commands=() + + for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` + echo -e "\nspecific_errors: ${specific_errors}\n" + if [ -n "$specific_errors" ]; then + echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" + should_retry=1 + should_retry_create_view_commands+=("$create_view_command") + else + sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. + fi + done + + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + + new_num_of_views_to_retry=${#should_retry_create_view_commands} + if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then + echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + exit 3 + else + previous_num_of_views_to_retry=$new_num_of_views_to_retry + fi + + all_create_view_commands=$should_retry_create_view_command + done + + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + echo "Computing stats for tables.." 
+ for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. + create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + fi + done + + rm -f error.log + + echo -e "\n\nFinished processing db: ${db}\n\n" } + MONITOR_DB=$1 #HADOOP_USER_NAME=$2 diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 3f8447b6c..0f248a79f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,9 +6,13 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi + +# Set the active HDFS node of OCEAN and IMPALA cluster. +OCEAN_HDFS_NODE='hdfs://nameservice1' +echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" + IMPALA_HDFS_NODE='' COUNTER=0 - while [ $COUNTER -lt 3 ]; do if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' @@ -22,76 +26,178 @@ while [ $COUNTER -lt 3 ]; do fi ((COUNTER++)) done - if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! 
$COUNTER\n\n" + echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' +IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' + +IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" + +# Set sed arguments. +LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) + +# Set the SED command arguments for column-names with reserved words: +DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' +DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. +DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' + +HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' +HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' +HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' + +LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' +LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' +LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' + export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" + + function copydb() { db=$1 - FILE=("hive_wf_tmp_"$RANDOM) - hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/$FILE/ - # copy the databases from ocean to impala - echo "copying $db" - hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/$FILE/ + # Delete the old DB from Impala cluster (if exists). 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + rm -f error.log + exit 2 + fi - # change ownership to impala - hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + # Make Impala aware of the deletion of the old DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - # drop tables from db - for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; - done + echo "Copying $db files from Ocean to Impala cluster.." + # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s + # Using max memory of: 50 * 6144 = 300 Gb + # Using 1MB as a buffer-size. + # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop + # The "ug" args cannot be used as we get a "User does not belong to hive" error. + # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. 
+ hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ + -numListstatusThreads 40 \ + -copybuffersize 1048576 \ + -strategy dynamic \ + -pb \ + ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} - # drop views from db - for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; - done + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. + #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - # delete the database - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + echo "Creating schema for ${db}" - # create the databases - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + # create the new database (with the same name) + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" - impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA" - echo "creating schema for ${db}" - for (( k = 0; k < 5; k ++ )); do - for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; - do - impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; - done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + # Make Impala aware of the creation of the new DB immediately. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. 
+ # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. + + all_create_view_commands=() + + for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. + create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + + create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` + if [ -n "$create_view_command_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" + create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ + | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ + | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + all_create_view_commands+=("$create_view_command") + else + echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" + CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` + if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. + echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" + else + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log + log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` + if [ -n "$log_errors" ]; then + echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + fi + fi + fi done -# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; -# do -# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; -# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - -# -# # run the same command twice because we may have failures in the first run (due to views pointing to the same db) -# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`; -# do -# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i"; -# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + echo -e "\nAll tables have been created, going to create the views..\n" - # load the data from /tmp in the respective tables - echo "copying data in tables and computing stats" - for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; - do - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; - impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; - done + # Make Impala aware of the new tables. 
+ sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 - # deleting the remaining directory from hdfs -hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db + # Time to loop through the views and create them. + # At this point all table-schemas should have been created. + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG + + should_retry=1 # Should retry creating the views (in case their tables where not created before them). + # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. + + previous_num_of_views_to_retry=${#all_create_view_commands} + + while ((should_retry)); do + + # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. + # In this case, we should retry creating this particular view again. + should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. + + should_retry_create_view_commands=() + + for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` + echo -e "\nspecific_errors: ${specific_errors}\n" + if [ -n "$specific_errors" ]; then + echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" + should_retry=1 + should_retry_create_view_commands+=("$create_view_command") + else + sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. + fi + done + + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + + new_num_of_views_to_retry=${#should_retry_create_view_commands} + if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then + echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + exit 3 + else + previous_num_of_views_to_retry=$new_num_of_views_to_retry + fi + + all_create_view_commands=$should_retry_create_view_command + done + + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 + + echo "Computing stats for tables.." + for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. + create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. 
+ if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; + fi + done + + rm -f error.log + + echo -e "\n\nFinished processing db: ${db}\n\n" } STATS_DB=$1 From 22745027c8f900be793b342c70427adf8c959c91 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 11 Apr 2024 17:46:33 +0300 Subject: [PATCH 3/8] Use the "HADOOP_USER_NAME" value from the "workflow-property", in "copyDataToImpalaCluster.sh", in "stats-monitor-updates". --- .../stats-monitor/oozie_app/copyDataToImpalaCluster.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index a16f769e7..82c38bb65 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -6,7 +6,7 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi -#export HADOOP_USER_NAME=$2 +export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. 
OCEAN_HDFS_NODE='hdfs://nameservice1' @@ -56,10 +56,6 @@ LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' -export HADOOP_USER="dimitris.pierrakos" -export HADOOP_USER_NAME='dimitris.pierrakos' - - function copydb() { db=$1 @@ -204,7 +200,6 @@ function copydb() { MONITOR_DB=$1 -#HADOOP_USER_NAME=$2 copydb $MONITOR_DB'_institutions' copydb $MONITOR_DB From 14719dcd6202233dd076fc377094e7d48cdc1d22 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 12 Apr 2024 15:36:13 +0300 Subject: [PATCH 4/8] Miscellaneous updates to the copying operation to Impala Cluster: - Update the algorithm for creating views that depend on other views. - Add check for successful execution of the "hadoop distcp" command. - Add a check for successful copy operation of all entities. - Upon facing an error in a DB, exit the method, instead of the whole script. - Improve logging. - Code polishing. --- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- .../oozie_app/copyDataToImpalaCluster.sh | 60 ++++++++++++------- 4 files changed, 160 insertions(+), 80 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 3a8dd8fb6..fceb1b76b 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -66,7 +66,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! 
EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -87,6 +87,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -104,7 +113,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -140,45 +150,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. 
previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. - should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. 
fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." - exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -186,7 +195,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -194,6 +206,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 4ff2b746d..7ff6a5d52 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -65,7 +65,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -86,6 +86,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -103,7 +112,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -139,45 +149,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
- should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." 
- exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -185,7 +194,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -193,6 +205,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 82c38bb65..8900adcb5 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -65,7 +65,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -86,6 +86,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -103,7 +112,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. + entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -139,45 +149,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. 
- should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. + for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." 
- exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -185,7 +194,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -193,6 +205,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 0f248a79f..bbb5e43ee 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -67,7 +67,7 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + return 1 fi # Make Impala aware of the deletion of the old DB immediately. @@ -88,6 +88,15 @@ function copydb() { -pb \ ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} + # Check the exit status of the "hadoop distcp" command. + if [ $? -eq 0 ]; then + echo "Successfully copied the files of '${db}'." + else + echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + rm -f error.log + return 2 + fi + # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db @@ -105,7 +114,8 @@ function copydb() { all_create_view_commands=() - for i in `hive -e "show tables in ${db};" | sed 's/WARN:.*//g'`; do # Get the tables and views without any potential the "WARN" logs. 
+ entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. @@ -141,45 +151,44 @@ function copydb() { # Time to loop through the views and create them. # At this point all table-schemas should have been created. - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n\n" # DEBUG - - should_retry=1 # Should retry creating the views (in case their tables where not created before them). - # There are views of other views as well, so we may have 3,4,5 nested-view and need to retry.. previous_num_of_views_to_retry=${#all_create_view_commands} + if [[ $previous_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + else + echo -e "\nDB '${db}' does not contain views.\n" + fi - while ((should_retry)); do - + level_counter=0 + while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry=0 # We should NOT do another iteration, unless at least one view could NOT be created. - should_retry_create_view_commands=() - for create_view_command in "${all_create_view_commands[@]}"; do # Get the tables and views without any potential the "WARN" logs. 
+ for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` - echo -e "\nspecific_errors: ${specific_errors}\n" if [ -n "$specific_errors" ]; then + echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry=1 should_retry_create_view_commands+=("$create_view_command") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" - new_num_of_views_to_retry=${#should_retry_create_view_commands} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." 
- exit 3 - else + return 3 + elif [[ $new_num_of_views_to_retry -gt 0 ]]; then + echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry + else + echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" fi - - all_create_view_commands=$should_retry_create_view_command + all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 @@ -187,7 +196,10 @@ function copydb() { sleep 1 echo "Computing stats for tables.." - for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`; do + + entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` + + for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. 
@@ -195,6 +207,14 @@ function copydb() { fi done + if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then + echo -e "\nAll entities have been copied to Impala cluster.\n" + else + echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + rm -f error.log + return 4 + fi + rm -f error.log echo -e "\n\nFinished processing db: ${db}\n\n" From d7da4f814ba17f71e09c047c0de8cbcd90c481b8 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 12 Apr 2024 18:12:06 +0300 Subject: [PATCH 5/8] Minor updates to the copying operation to Impala Cluster: - Improve logging. - Code optimization/polishing. --- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- .../oozie_app/copyDataToImpalaCluster.sh | 83 +++++++++---------- 4 files changed, 160 insertions(+), 172 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index fceb1b76b..3d9986b64 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -11,7 +11,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. 
OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -29,10 +29,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." +echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -59,12 +59,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -73,7 +74,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." 
+ echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -89,9 +90,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -99,7 +100,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -111,31 +112,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". 
Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | 
grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -143,65 +144,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. 
+ sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. - should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. 
+ impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -209,13 +207,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 7ff6a5d52..2711d6e12 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -10,7 +10,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -28,10 +28,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -58,12 +58,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -72,7 +73,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -88,9 +89,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -98,7 +99,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -110,31 +111,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -142,65 +143,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -208,13 +206,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 8900adcb5..5ad9df762 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -10,7 +10,7 @@ export HADOOP_USER_NAME=$2 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -28,10 +28,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -58,12 +58,13 @@ LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -72,7 +73,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -88,9 +89,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -98,7 +99,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -110,31 +111,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -142,65 +143,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -208,13 +206,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index bbb5e43ee..c2324b912 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -9,7 +9,7 @@ fi # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' -echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" +echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" IMPALA_HDFS_NODE='' COUNTER=0 @@ -27,10 +27,10 @@ while [ $COUNTER -lt 3 ]; do ((COUNTER++)) done if [ -z "$IMPALA_HDFS_NODE" ]; then - echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" + echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" exit 1 fi -echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries." 
+echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu' IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' @@ -60,12 +60,13 @@ export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" function copydb() { db=$1 + echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log return 1 fi @@ -74,7 +75,7 @@ function copydb() { sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - echo "Copying $db files from Ocean to Impala cluster.." + echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" # Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max memory of: 50 * 6144 = 300 Gb # Using 1MB as a buffer-size. @@ -90,9 +91,9 @@ function copydb() { # Check the exit status of the "hadoop distcp" command. if [ $? -eq 0 ]; then - echo "Successfully copied the files of '${db}'." + echo -e "\nSuccessfully copied the files of '${db}'.\n" else - echo "Failed to transfer the files of '${db}', with 'hadoop distcp'. Got with exit status: $?" + echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" rm -f error.log return 2 fi @@ -100,7 +101,7 @@ function copydb() { # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db - echo "Creating schema for ${db}" + echo -e "\nCreating schema for db: '${db}'\n" # create the new database (with the same name) impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" @@ -112,31 +113,31 @@ function copydb() { # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table " output from hive to create the exact same table in impala. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. - all_create_view_commands=() + all_create_view_statements=() entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. - # Check if this is a view by showing the create-command where it should print "create view" for a view, not the "create table". Unfortunately, there is now "show views" command. - create_entity_command=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line command. + # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. + create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- create_view_command_test=`echo -e "$create_entity_command" | grep 'CREATE VIEW'` - if [ -n "$create_view_command_test" ]; then - echo -e "\n'${i}' is a view, so we will save its 'create view' command and execute it on Impala, after all tables have been created.\n" - create_view_command=`echo -e "$create_entity_command" | sed 's/WARN:.*//g' | sed 's/\`//g' \ + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + if [ -n "$create_view_statement_test" ]; then + echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` - all_create_view_commands+=("$create_view_command") + all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. - echo -e "\n\nTHE TABLE \"${i}\" HAD NO PARQUET FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" + echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! 
IT'S EMPTY!\n\n" else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then - echo -e "\n\nTHERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" fi fi fi @@ -144,65 +145,62 @@ function copydb() { echo -e "\nAll tables have been created, going to create the views..\n" - # Make Impala aware of the new tables. - sleep 1 - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" - sleep 1 - # Time to loop through the views and create them. # At this point all table-schemas should have been created. - previous_num_of_views_to_retry=${#all_create_view_commands} + previous_num_of_views_to_retry=${#all_create_view_statements} if [[ $previous_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nAll_create_view_commands:\n\n${all_create_view_commands[@]}\n" # DEBUG + echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG + # Make Impala aware of the new tables, so it knows them when creating the views. + sleep 1 + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" + sleep 1 else - echo -e "\nDB '${db}' does not contain views.\n" + echo -e "\nDB '${db}' does not contain any views.\n" fi level_counter=0 - while [[ ${#all_create_view_commands[@]} -gt 0 ]]; do + while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do ((level_counter++)) # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # In this case, we should retry creating this particular view again. 
- should_retry_create_view_commands=() + should_retry_create_view_statements=() - for create_view_command in "${all_create_view_commands[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. - impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_command}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later + for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. + impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` if [ -n "$specific_errors" ]; then echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" - should_retry_create_view_commands+=("$create_view_command") + should_retry_create_view_statements+=("$create_view_statement") else sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. fi done - new_num_of_views_to_retry=${#should_retry_create_view_commands} + new_num_of_views_to_retry=${#should_retry_create_view_statements} if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then - echo -e "THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING.." + echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! 
THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" return 3 elif [[ $new_num_of_views_to_retry -gt 0 ]]; then - echo -e "\nTo be retried \"create_view_commands\":\n\n${should_retry_create_view_commands[@]}\n" + echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" previous_num_of_views_to_retry=$new_num_of_views_to_retry else - echo -e "\nFinished creating views db: ${db}, in level-${level_counter}.\n" + echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" fi - all_create_view_commands=("${should_retry_create_view_command[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. + all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. done sleep 1 impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA" sleep 1 - echo "Computing stats for tables.." - + echo -e "\nComputing stats for tables..\n" entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` - for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. - create_view_command=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. - if [ -z "$create_view_command" ]; then # If it's a table, then go load the data to it. + create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. + if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. 
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; fi done @@ -210,13 +208,12 @@ function copydb() { if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then echo -e "\nAll entities have been copied to Impala cluster.\n" else - echo -e "\n\n1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" + echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log return 4 fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" } From 43b454399f2099912dcc31961f03dce6ce2b41cd Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 15 Apr 2024 18:19:29 +0200 Subject: [PATCH 6/8] - Bug fix in matchOrderedTokenAndAbbreviations algorithms where tokens with same initial character were always considered equal - AuthorsMatch exploits the new matching strategy used for ORCID enhancements in #PR398: split author names in tokens, order the tokens, then check for matches of ordered full tokens or abbreviations --- .../dhp/schema/oaf/utils/MergeUtils.java | 7 +- .../eu/dnetlib/pace/tree/AuthorsMatch.java | 45 ++++-- .../eu/dnetlib/pace/util/AuthorMatchers.scala | 53 ++++++- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 145 ++++++++++-------- .../enrich/orcid/ORCIDAuthorEnricher.scala | 11 +- .../orcid/ORCIDAuthorMatchersTest.scala | 2 +- 6 files changed, 169 insertions(+), 94 deletions(-) rename dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala => dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala (56%) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index 0ff90e024..316891faf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -497,9 +497,14 @@ 
public class MergeUtils { } private static Field selectOldestDate(Field d1, Field d2) { + if (d1 == null || StringUtils.isBlank(d1.getValue())) { + return d2; + } else if (d2 == null || StringUtils.isBlank(d2.getValue())) { + return d1; + } + return Stream .of(d1, d2) - .filter(Objects::nonNull) .min( Comparator .comparing( diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index edad0ae2e..0921d7a64 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -1,16 +1,18 @@ package eu.dnetlib.pace.tree; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - import com.wcohen.ss.AbstractStringDistance; - import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.tree.support.AbstractListComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.util.AuthorMatchers; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.stream.Collectors; @ComparatorClass("authorsMatch") public class AuthorsMatch extends AbstractListComparator { @@ -41,24 +43,36 @@ public class AuthorsMatch extends AbstractListComparator { } @Override - public double compare(final List a, final List b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) + public double compare(final List left, final List right, final Config conf) { + if (left.isEmpty() || right.isEmpty()) return -1; - if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) + if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD) return 1.0; - int maxMiss = Integer.MAX_VALUE; - List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - Double threshold = getDoubleParam("threshold"); + int maxMiss = 
Integer.MAX_VALUE; - if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) { - maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size())); + if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) { + maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size())); } int common = 0; + + List a = new ArrayList<>(left); + List b = new ArrayList<>(right); + + common += AuthorMatchers + .removeMatches(a, b, (BiFunction) AuthorMatchers::matchEqualsIgnoreCase) + .size() / 2; + common += AuthorMatchers + .removeMatches(a, b, (BiFunction) AuthorMatchers::matchOrderedTokenAndAbbreviations) + .size() / 2; + + List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + // compare each element of List1 with each element of List2 + int alreadyMatched = common; for (int i = 0; i < a.size(); i++) { Person p1 = new Person(a.get(i), false); @@ -123,13 +137,13 @@ public class AuthorsMatch extends AbstractListComparator { } } - if (i - common > maxMiss) { + if (i - common - alreadyMatched > maxMiss) { return 0.0; } } // normalization factor to compute the score - int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common); + int normFactor = left.size() == right.size() ? 
left.size() : (left.size() + right.size() - common); if (TYPE.equals("percentage")) { return (double) common / normFactor; @@ -160,5 +174,4 @@ public class AuthorsMatch extends AbstractListComparator { public String normalization(String s) { return normalize(utf8(cleanup(s))); } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala similarity index 56% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala rename to dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala index 49574fe2d..116f515ed 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchers.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/AuthorMatchers.scala @@ -1,9 +1,10 @@ -package eu.dnetlib.dhp.enrich.orcid +package eu.dnetlib.pace.util import java.util.Locale import java.util.regex.Pattern +import scala.util.control.Breaks.{break, breakable} -object ORCIDAuthorMatchers { +object AuthorMatchers { val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+") val WORD_DIFF = 2 @@ -45,7 +46,8 @@ object ORCIDAuthorMatchers { var res: Boolean = false if (e1.length != 1 && e2.length != 1) { res = e1 == e2 - longMatches += 1 + if (res) + longMatches += 1 } else { res = true shortMatches += 1 @@ -62,4 +64,49 @@ object ORCIDAuthorMatchers { } longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length) } + + def removeMatches( + graph_authors: java.util.List[String], + orcid_authors: java.util.List[String], + matchingFunc: java.util.function.BiFunction[String,String,Boolean] + ) : java.util.List[String] = { + removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b)) + } + + + def removeMatches( + graph_authors: java.util.List[String], + orcid_authors: java.util.List[String], + matchingFunc: (String, String) => Boolean + ) 
: java.util.List[String] = { + val matched = new java.util.ArrayList[String]() + + if (graph_authors != null && !graph_authors.isEmpty) { + val ait = graph_authors.iterator + + while (ait.hasNext) { + val author = ait.next() + val oit = orcid_authors.iterator + + breakable { + while (oit.hasNext) { + val orcid = oit.next() + + if (matchingFunc(author, orcid)) { + ait.remove() + oit.remove() + + matched.add(author) + matched.add(orcid) + + break() + } + } + } + } + } + + matched + } + } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 8b3480e60..2c96b7399 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -43,15 +43,13 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import eu.dnetlib.dhp.schema.sx.OafUtils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import scala.Tuple2; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class SparkDedupTest implements Serializable { + static final boolean CHECK_CARDINALITIES = true; @Mock(serializable = true) ISLookUpService isLookUpService; @@ -191,12 +189,13 @@ public class SparkDedupTest implements Serializable { System.out.println("ds_simrel = " + ds_simrel); System.out.println("orp_simrel = " + orp_simrel); - assertEquals(751, orgs_simrel); - assertEquals(546, pubs_simrel); - assertEquals(113, sw_simrel); - assertEquals(148, ds_simrel); - assertEquals(280, orp_simrel); - + if (CHECK_CARDINALITIES) { 
+ assertEquals(751, orgs_simrel); + assertEquals(566, pubs_simrel); + assertEquals(113, sw_simrel); + assertEquals(148, ds_simrel); + assertEquals(280, orp_simrel); + } } @Test @@ -239,21 +238,27 @@ public class SparkDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct")) .count(); - // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) - assertEquals(751, orgs_simrel); - assertEquals(546, pubs_simrel); - assertEquals(148, ds_simrel); - assertEquals(280, orp_simrel); -// System.out.println("orgs_simrel = " + orgs_simrel); -// System.out.println("pubs_simrel = " + pubs_simrel); -// System.out.println("ds_simrel = " + ds_simrel); -// System.out.println("orp_simrel = " + orp_simrel); - // entities simrels to be different from the number of previous step (new simrels in the whitelist) Dataset sw_simrel = spark .read() .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software")); + System.out.println("orgs_simrel = " + orgs_simrel); + System.out.println("pubs_simrel = " + pubs_simrel); + System.out.println("ds_simrel = " + ds_simrel); + System.out.println("orp_simrel = " + orp_simrel); + System.out.println("sw_simrel = " + sw_simrel.count()); + + // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) + if (CHECK_CARDINALITIES) { + assertEquals(751, orgs_simrel); + assertEquals(566, pubs_simrel); + assertEquals(148, ds_simrel); + assertEquals(280, orp_simrel); + assertEquals(115, sw_simrel.count()); + } + + // check if the first relation in the whitelist exists assertTrue( sw_simrel @@ -272,10 +277,6 @@ public class SparkDedupTest implements Serializable { rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0]) && rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1])) .count() > 0); - - assertEquals(115, sw_simrel.count()); -// 
System.out.println("sw_simrel = " + sw_simrel.count()); - } @Test @@ -466,17 +467,19 @@ public class SparkDedupTest implements Serializable { assertTrue(dups.contains(r.getSource())); }); - assertEquals(1268, orgs_mergerel); - assertEquals(1112, pubs.count()); - assertEquals(292, sw_mergerel); - assertEquals(476, ds_mergerel); - assertEquals(742, orp_mergerel); -// System.out.println("orgs_mergerel = " + orgs_mergerel); -// System.out.println("pubs_mergerel = " + pubs_mergerel); -// System.out.println("sw_mergerel = " + sw_mergerel); -// System.out.println("ds_mergerel = " + ds_mergerel); -// System.out.println("orp_mergerel = " + orp_mergerel); + System.out.println("orgs_mergerel = " + orgs_mergerel); + System.out.println("pubs_mergerel = " + pubs.count()); + System.out.println("sw_mergerel = " + sw_mergerel); + System.out.println("ds_mergerel = " + ds_mergerel); + System.out.println("orp_mergerel = " + orp_mergerel); + if (CHECK_CARDINALITIES) { + assertEquals(1268, orgs_mergerel); + assertEquals(1156, pubs.count()); + assertEquals(292, sw_mergerel); + assertEquals(476, ds_mergerel); + assertEquals(742, orp_mergerel); + } } @Test @@ -552,17 +555,19 @@ public class SparkDedupTest implements Serializable { assertTrue(dups.contains(r.getSource())); }); - assertEquals(1268, orgs_mergerel); - assertEquals(1112, pubs.count()); - assertEquals(292, sw_mergerel); - assertEquals(476, ds_mergerel); - assertEquals(742, orp_mergerel); -// System.out.println("orgs_mergerel = " + orgs_mergerel); -// System.out.println("pubs_mergerel = " + pubs_mergerel); -// System.out.println("sw_mergerel = " + sw_mergerel); -// System.out.println("ds_mergerel = " + ds_mergerel); -// System.out.println("orp_mergerel = " + orp_mergerel); + System.out.println("orgs_mergerel = " + orgs_mergerel); + System.out.println("pubs_mergerel = " + pubs.count()); + System.out.println("sw_mergerel = " + sw_mergerel); + System.out.println("ds_mergerel = " + ds_mergerel); + System.out.println("orp_mergerel = " 
+ orp_mergerel); + if (CHECK_CARDINALITIES) { + assertEquals(1268, orgs_mergerel); + assertEquals(1156, pubs.count()); + assertEquals(292, sw_mergerel); + assertEquals(476, ds_mergerel); + assertEquals(742, orp_mergerel); + } } @Test @@ -607,19 +612,21 @@ public class SparkDedupTest implements Serializable { testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") .count(); - assertEquals(86, orgs_deduprecord); - assertEquals(91, pubs.count()); - assertEquals(47, sw_deduprecord); - assertEquals(97, ds_deduprecord); - assertEquals(92, orp_deduprecord); + System.out.println("orgs_deduprecord = " + orgs_deduprecord); + System.out.println("pubs_deduprecord = " + pubs.count()); + System.out.println("sw_deduprecord = " + sw_deduprecord); + System.out.println("ds_deduprecord = " + ds_deduprecord); + System.out.println("orp_deduprecord = " + orp_deduprecord); + + if (CHECK_CARDINALITIES) { + assertEquals(86, orgs_deduprecord); + assertEquals(96, pubs.count()); + assertEquals(47, sw_deduprecord); + assertEquals(97, ds_deduprecord); + assertEquals(92, orp_deduprecord); + } verifyRoot_1(mapper, pubs); - -// System.out.println("orgs_deduprecord = " + orgs_deduprecord); -// System.out.println("pubs_deduprecord = " + pubs_deduprecord); -// System.out.println("sw_deduprecord = " + sw_deduprecord); -// System.out.println("ds_deduprecord = " + ds_deduprecord); -// System.out.println("orp_deduprecord = " + orp_deduprecord); } private static void verifyRoot_1(ObjectMapper mapper, Dataset pubs) { @@ -745,21 +752,23 @@ public class SparkDedupTest implements Serializable { .distinct() .count(); - assertEquals(925, publications); - assertEquals(839, organizations); - assertEquals(100, projects); - assertEquals(100, datasource); - assertEquals(196, softwares); - assertEquals(389, dataset); - assertEquals(520, otherresearchproduct); + System.out.println("publications = " + publications); + System.out.println("organizations = " + organizations); + 
System.out.println("projects = " + projects); + System.out.println("datasource = " + datasource); + System.out.println("software = " + softwares); + System.out.println("dataset = " + dataset); + System.out.println("otherresearchproduct = " + otherresearchproduct); -// System.out.println("publications = " + publications); -// System.out.println("organizations = " + organizations); -// System.out.println("projects = " + projects); -// System.out.println("datasource = " + datasource); -// System.out.println("software = " + softwares); -// System.out.println("dataset = " + dataset); -// System.out.println("otherresearchproduct = " + otherresearchproduct); + if (CHECK_CARDINALITIES) { + assertEquals(930, publications); + assertEquals(839, organizations); + assertEquals(100, projects); + assertEquals(100, datasource); + assertEquals(196, softwares); + assertEquals(389, dataset); + assertEquals(520, otherresearchproduct); + } long deletedOrgs = jsc .textFile(testDedupGraphBasePath + "/organization") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala index e2e7fada6..2e23a3a59 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorEnricher.scala @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.enrich.orcid import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.{Author, StructuredProperty} import eu.dnetlib.dhp.schema.sx.OafUtils +import eu.dnetlib.pace.util.AuthorMatchers import java.util import scala.beans.BeanProperty @@ -39,7 +40,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName), + 
AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.givenName + " " + orcid.familyName), "fullName" ) ++ // Look after exact reversed fullname match, reconstruct ORCID fullname as familyName + givenName @@ -47,7 +48,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName), + AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.familyName + " " + orcid.givenName), "reversedFullName" ) ++ // split author names in tokens, order the tokens, then check for matches of full tokens or abbreviations @@ -55,7 +56,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - ORCIDAuthorMatchers + AuthorMatchers .matchOrderedTokenAndAbbreviations(author.getFullname, orcid.givenName + " " + orcid.familyName), "orderedTokens" ) ++ @@ -63,7 +64,7 @@ object ORCIDAuthorEnricher extends Serializable { extractAndEnrichMatches( unmatched_authors, orcid_authors, - (author, orcid) => ORCIDAuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName), + (author, orcid) => AuthorMatchers.matchEqualsIgnoreCase(author.getFullname, orcid.creditName), "creditName" ) ++ // look after exact matches in ORCID otherNames @@ -71,7 +72,7 @@ object ORCIDAuthorEnricher extends Serializable { unmatched_authors, orcid_authors, (author, orcid) => - orcid.otherNames != null && ORCIDAuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala), + orcid.otherNames != null && AuthorMatchers.matchOtherNames(author.getFullname, orcid.otherNames.asScala), "otherNames" ) } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala index f109ebe24..f3a5fe77c 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala @@ -1,6 +1,6 @@ package eu.dnetlib.dhp.enrich.orcid -import eu.dnetlib.dhp.enrich.orcid.ORCIDAuthorMatchers.matchOrderedTokenAndAbbreviations +import eu.dnetlib.pace.util.AuthorMatchers.matchOrderedTokenAndAbbreviations import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue} import org.junit.jupiter.api.Test From 78b9d84e4a380699d7316a528eac49bcc7a3dbb9 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 16 Apr 2024 09:41:16 +0200 Subject: [PATCH 7/8] test --- .../plugin/rest/OsfPreprintCollectorTest.java | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java new file mode 100644 index 000000000..2f0263a0d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java @@ -0,0 +1,84 @@ +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; + +public class 
OsfPreprintCollectorTest { + + private static final Logger log = LoggerFactory.getLogger(OsfPreprintCollectorTest.class); + + private final String baseUrl = "https://api.osf.io/v2/preprints/"; + + // private final String requestHeaderMap = ""; + // private final String authMethod = ""; + // private final String authToken = ""; + // private final String resultOutputFormat = ""; + + private final String queryParams = "filter:is_published:d=true"; + + private final String entityXpath = "/*/*[local-name()='data']"; + + private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']"; + + private final String resumptionParam = "page"; + private final String resumptionType = "page"; + private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']"; + + private final String resultSizeParam = ""; + private final String resultSizeValue = ""; + + private final String resultFormatParam = "format"; + private final String resultFormatValue = "json"; + + private final ApiDescriptor api = new ApiDescriptor(); + private RestCollectorPlugin rcp; + + @BeforeEach + public void setUp() { + final HashMap params = new HashMap<>(); + params.put("resumptionType", this.resumptionType); + params.put("resumptionParam", this.resumptionParam); + params.put("resumptionXpath", this.resumptionXpath); + params.put("resultTotalXpath", this.resultTotalXpath); + params.put("resultFormatParam", this.resultFormatParam); + params.put("resultFormatValue", this.resultFormatValue); + params.put("resultSizeParam", this.resultSizeParam); + params.put("resultSizeValue", this.resultSizeValue); + params.put("queryParams", this.queryParams); + params.put("entityXpath", this.entityXpath); + + this.api.setBaseUrl(this.baseUrl); + this.api.setParams(params); + + this.rcp = new RestCollectorPlugin(new HttpClientParams()); + } + + @Test + @Disabled + void test() throws CollectorException { + final AtomicInteger i = new AtomicInteger(0); + 
final Stream stream = this.rcp.collect(this.api, new AggregatorReport()); + + stream.limit(200).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + i.incrementAndGet(); + log.info(s); + }); + + log.info("{}", i.intValue()); + Assertions.assertTrue(i.intValue() > 0); + } +} From d070db4a32c80d7715d5eddccde68341a3ac7314 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 16 Apr 2024 09:41:59 +0200 Subject: [PATCH 8/8] added a couple more invalid author names --- .../eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 9386db933..2be4e8e0c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -92,6 +92,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { INVALID_AUTHOR_NAMES.add("null anonymous"); INVALID_AUTHOR_NAMES.add("unbekannt"); INVALID_AUTHOR_NAMES.add("unknown"); + INVALID_AUTHOR_NAMES.add("autor, Sin"); + INVALID_AUTHOR_NAMES.add("Desconocido / Inconnu,"); INVALID_URL_HOSTS.add("creativecommons.org"); INVALID_URL_HOSTS.add("www.academia.edu");