dnet-hadoop/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh

export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
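# The two exports above keep impala-shell's Python egg cache in a per-user directory;
# (re)create the expected /tmp symlink to it if it is missing.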
if ! [ -L $link_folder ]; then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
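# Find the active namenode of the Impala cluster by probing each one.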
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then
    IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then
    IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
else
    IMPALA_HDFS_NODE='PROBLEM WHEN SETTING THE HDFS-NODE!'
fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
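# copydb <db>: copy the given database from the OCEAN cluster to the Impala cluster.
# It distcp's the database's warehouse directory into a temporary HDFS dir on the Impala cluster,
# drops and recreates the database on Impala (deriving each table schema from a parquet file),
# loads the data into the new tables and computes their stats.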
function copydb() {
    db=$1
    TEMP_SUBDIR="hive_wf_tmp_${RANDOM}"
    hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/${TEMP_SUBDIR}/
    # Copy the database from the ocean cluster to the Impala cluster.
    echo "copying $db"
    hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/${TEMP_SUBDIR}/
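    # Note: "-pb" preserves the source HDFS block size (so distcp's checksum verification of the
    # copied files succeeds) and the mapper memory is raised for the large copy jobs.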
    # Give all permissions to Impala.
    hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 /tmp/${TEMP_SUBDIR}/${db}.db
    # Delete the old DB from the Impala cluster.
    # Drop the tables of the db.
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table ${i};"
    done
    # Drop the views of the db.
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "drop view ${i};"
    done
    # Delete the old database itself.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade";
    # Create the new database.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}";
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    # Create the table schemas.
    echo "creating schema for ${db}"
    # "Hive" and "Impala" do not have fully compatible schemas, so we cannot feed the output of
    # Hive's "show create table <name>" to Impala to recreate the exact same table there.
    # Instead, for each table we locate at least one parquet file in the copied data (checking that it exists)
    # and let Impala extract the schema directly from that file.
    # The "2-times-loop" is there to retry creating the views whose underlying tables had not been created yet.
    # Since this script handles many DBs and more may be added, we cannot easily keep track of the views of all those tables, so this extra loop stays for now.
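    # Each generated statement has the form (values below are illustrative):
    #   create table some_table like parquet '/tmp/hive_wf_tmp_12345/some_db.db/some_table/part-00000.parq' stored as parquet;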
    for (( k = 1; k <= 2; k++ )); do
        echo -e "\nCreate tables iteration_${k}\n"
        for i in `hive -e "use $db; show tables;" | sed 's/WARN:.*//g'`; do
            CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "/tmp/${TEMP_SUBDIR}/${db}.db/${i}/*.parq" | head -1`
            if [[ -z "$CURRENT_PRQ_FILE" ]]; then
                echo -e "The table \"${i}\" had no parquet files to get the schema from!\n"
                continue
            fi
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "create table ${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;"
        done
    done
    # Load the data from /tmp into the respective tables.
    echo "copying data in tables and computing stats"
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "invalidate metadata ${i};"
        sleep 1; # Sometimes we need a small break between commands, otherwise we get an error.
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "load data inpath '/tmp/${TEMP_SUBDIR}/${db}.db/${i}' into table ${i}";
        sleep 1;
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "invalidate metadata ${i};"
        sleep 1;
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "compute stats ${i}";
    done
    # Delete the remaining temporary directory from the Impala-cluster HDFS.
    hdfs dfs -conf ${IMPALA_CONFIG_FILE} -rm -R /tmp/${TEMP_SUBDIR}/${db}.db
}
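# The six positional arguments are the five database names to copy plus the HDFS user to run as.
# Illustrative invocation (argument values are hypothetical):
#   ./copyDataToImpalaCluster.sh <stats_db> <monitor_db> <observatory_db> <ext_db> <usage_stats_db> <hadoop_user>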
STATS_DB=$1
MONITOR_DB=$2
OBSERVATORY_DB=$3
EXT_DB=$4
USAGE_STATS_DB=$5
HADOOP_USER_NAME=$6
copydb $USAGE_STATS_DB
copydb $PROD_USAGE_STATS_DB
copydb $EXT_DB
copydb $STATS_DB
copydb $MONITOR_DB
copydb $OBSERVATORY_DB
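# Also copy the monitor sub-databases: "funded", "institutions", "ris_tail" and one per context (see the loop below).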
copydb $MONITOR_DB'_funded'
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
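# Each context id is turned into a DB-name suffix by replacing "-" and "::" with "_",
# e.g. "dh-ch::other" becomes "dh_ch_other", so the copied DB is "${MONITOR_DB}_dh_ch_other".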
for i in ${contexts}; do
    tmp=`echo "$i" | sed 's/-/_/g' | sed 's/::/_/g'`
    copydb ${MONITOR_DB}'_'${tmp}
done