dnet-hadoop/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh

export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
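# The two exports above keep impala-shell's Python egg cache in a per-user directory;
# (re)create the expected /tmp symlink to it if it is missing.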
if ! [ -L $link_folder ]; then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
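# Find the active namenode of the Impala cluster by probing each one.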
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then
    IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then
    IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
else
    IMPALA_HDFS_NODE='PROBLEM WHEN SETTING THE HDFS-NODE!'
fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
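# copydb <db>: copy the given database from the OCEAN cluster to the Impala cluster.
# It distcp's the database's warehouse directory into a temporary HDFS dir on the Impala cluster,
# drops and recreates the database on Impala (deriving each table schema from a parquet file),
# loads the data into the new tables and computes their stats.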
function copydb() {
    db=$1
    TEMP_SUBDIR="hive_wf_tmp_${RANDOM}"
    hdfs dfs -mkdir ${IMPALA_HDFS_NODE}/tmp/${TEMP_SUBDIR}/
    # Copy the database from the ocean cluster to the Impala cluster.
    echo "copying $db"
    hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_NODE}/tmp/${TEMP_SUBDIR}/
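    # Note: "-pb" preserves the source HDFS block size (so distcp's checksum verification of the
    # copied files succeeds) and the mapper memory is raised for the large copy jobs.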
    # Give all permissions to Impala.
    hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 /tmp/${TEMP_SUBDIR}/${db}.db
    # Delete the old DB from the Impala cluster.
    # Drop the tables of the db.
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table ${i};"
    done
    # Drop the views of the db.
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "drop view ${i};"
    done
    # Delete the old database itself.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade";
    # Create the new database.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}";
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    # Create the table schemas.
    echo "creating schema for ${db}"
    # "Hive" and "Impala" do not have fully compatible schemas, so we cannot feed the output of
    # Hive's "show create table <name>" to Impala to recreate the exact same table there.
    # Instead, for each table we locate at least one parquet file in the copied data (checking that it exists)
    # and let Impala extract the schema directly from that file.
    # The "2-times-loop" is there to retry creating the views whose underlying tables had not been created yet.
    # Since this script handles many DBs and more may be added, we cannot easily keep track of the views of all those tables, so this extra loop stays for now.
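    # Each generated statement has the form (values below are illustrative):
    #   create table some_table like parquet '/tmp/hive_wf_tmp_12345/some_db.db/some_table/part-00000.parq' stored as parquet;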
    for (( k = 1; k <= 2; k++ )); do
        echo -e "\nCreate tables iteration_${k}\n"
        for i in `hive -e "use $db; show tables;" | sed 's/WARN:.*//g'`; do
            CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "/tmp/${TEMP_SUBDIR}/${db}.db/${i}/*.parq" | head -1`
            if [[ -z "$CURRENT_PRQ_FILE" ]]; then
                echo -e "The table \"${i}\" had no parquet files to get the schema from!\n"
                continue
            fi
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "create table ${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;"
        done
    done
    # Load the data from /tmp into the respective tables.
    echo "copying data in tables and computing stats"
    for i in `impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} --delimited -q "show tables"`; do
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "invalidate metadata ${i};"
        sleep 1; # Sometimes we need a small break between commands, otherwise we get an error.
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "load data inpath '/tmp/${TEMP_SUBDIR}/${db}.db/${i}' into table ${i}";
        sleep 1;
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "invalidate metadata ${i};"
        sleep 1;
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -d ${db} -q "compute stats ${i}";
    done
    # Delete the remaining temporary directory from the Impala-cluster HDFS.
    hdfs dfs -conf ${IMPALA_CONFIG_FILE} -rm -R /tmp/${TEMP_SUBDIR}/${db}.db
}
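# The six positional arguments are the five database names to copy plus the HDFS user to run as.
# Illustrative invocation (argument values are hypothetical):
#   ./copyDataToImpalaCluster.sh <stats_db> <monitor_db> <observatory_db> <ext_db> <usage_stats_db> <hadoop_user>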
STATS_DB=$1
MONITOR_DB=$2
OBSERVATORY_DB=$3
EXT_DB=$4
USAGE_STATS_DB=$5
HADOOP_USER_NAME=$6
copydb $USAGE_STATS_DB
copydb $PROD_USAGE_STATS_DB
copydb $EXT_DB
copydb $STATS_DB
copydb $MONITOR_DB
copydb $OBSERVATORY_DB
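# Also copy the monitor sub-databases: "funded", "institutions", "ris_tail" and one per context (see the loop below).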
copydb $MONITOR_DB'_funded'
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
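# Each context id is turned into a DB-name suffix by replacing "-" and "::" with "_",
# e.g. "dh-ch::other" becomes "dh_ch_other", so the copied DB is "${MONITOR_DB}_dh_ch_other".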
for i in ${contexts}; do
    tmp=`echo "$i" | sed 's/-/_/g' | sed 's/::/_/g'`
    copydb ${MONITOR_DB}'_'${tmp}
done