# forked from D-Net/dnet-hadoop
# Per-user Python egg cache, plus the /tmp directory impala-shell uses to
# unpack its python eggs; the latter is made a symlink into the egg cache.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L "$link_folder" ]; then
  # Not a symlink yet: remove whatever is in the way and (re)create the link.
  rm -Rf "$link_folder"
  # NOTE(review): the link target is ${PYTHON_EGG_CACHE}${link_folder}, i.e. the
  # two paths concatenated — this matches the upstream script, but confirm it is
  # intentional and not a missing separator.
  ln -sfn "${PYTHON_EGG_CACHE}${link_folder}" "$link_folder"
fi
# Run all hadoop/impala commands as the user given in the 6th argument.
export HADOOP_USER_NAME="$6"

# Production usage-stats DB, copied in addition to the DBs passed as arguments.
export PROD_USAGE_STATS_DB='openaire_prod_usage_stats'

# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
printf '%s\n' "OCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
# Determine the active Impala HDFS namenode by probing the two master nodes in
# order; leave an error-marker string in place if neither of them responds.
IMPALA_HDFS_NODE='PROBLEM WHEN SETTING THE HDFS-NODE!'
for candidate in 'hdfs://impala-cluster-mn1.openaire.eu' 'hdfs://impala-cluster-mn2.openaire.eu'; do
  if hdfs dfs -test -e "${candidate}" >/dev/null 2>&1; then
    IMPALA_HDFS_NODE="${candidate}:8020"
    break
  fi
done
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}"

# Impala daemon to run queries against, and the Impala cluster's HDFS config.
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
#######################################
# Copy one Hive database from the OCEAN cluster into the Impala cluster:
# distcp the warehouse files into a staging dir, recreate the DB on Impala
# (schemas inferred from parquet files), load the data, compute stats.
# Globals:
#   OCEAN_HDFS_NODE, IMPALA_HDFS_NODE, IMPALA_HOSTNAME, IMPALA_CONFIG_FILE,
#   HADOOP_USER_NAME (all read)
# Arguments:
#   $1 - name of the database to copy
# Outputs:
#   progress messages on stdout
#######################################
function copydb() {
  local db=$1
  # Unique staging subdir per invocation. (The original assigned a one-element
  # array here by accident; a plain string is what is actually used.)
  local temp_subdir="hive_wf_tmp_${RANDOM}"

  hdfs dfs -mkdir "${IMPALA_HDFS_NODE}/tmp/${temp_subdir}/"

  # Copy the database from ocean to the Impala cluster's staging dir.
  echo "copying $db"
  hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb \
    "${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db" \
    "${IMPALA_HDFS_NODE}/tmp/${temp_subdir}/"

  # Give all permissions to impala on the staged files.
  hdfs dfs -conf "${IMPALA_CONFIG_FILE}" -chmod -R 777 "/tmp/${temp_subdir}/${db}.db"

  # Delete the old DB from the Impala cluster.
  # NOTE: the drop commands are executed directly; the original wrapped them in
  # backticks, which ran impala-shell's *output* as a shell command. They now
  # also pass --user, consistent with every other impala-shell call here.
  # drop tables from db
  for i in $(impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" --delimited -q "show tables"); do
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "drop table ${i};"
  done

  # drop views from db
  for i in $(impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" --delimited -q "show tables"); do
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "drop view ${i};"
  done

  # delete the database, then recreate it empty
  impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "drop database if exists ${db} cascade"
  impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -q "create database ${db}"
  impala-shell --user "${HADOOP_USER_NAME}" -q "INVALIDATE METADATA"

  # Create schemas.
  echo "creating schema for ${db}"

  # Because "Hive" and "Impala" do not have compatible schemas, we cannot use
  # the "show create table <name>" output from hive to create the exact same
  # table in impala. So, we have to find at least one parquet file (check if
  # it's there) from each table for impala to extract the schema from.
  #
  # The "2-times-loop" is there to retry creating the views for which their
  # tables have not been created yet. Since there are many DBs handled in this
  # script and many more may be added, we cannot easily keep track of the views
  # of all those tables, so leave this extra loop for now.
  for (( k = 1; k <= 2; k++ )); do
    echo -e "\nCreate tables iteration_${k}\n"
    for i in $(hive -e "use $db; show tables;" | sed 's/WARN:.*//g'); do
      # Use the shared config variable (the original hard-coded the path here).
      CURRENT_PRQ_FILE=$(hdfs dfs -conf "${IMPALA_CONFIG_FILE}" -ls -C "/tmp/${temp_subdir}/${db}.db/${i}/*.parq" | head -1)
      if [[ -z "$CURRENT_PRQ_FILE" ]]; then
        echo -e "The table \"${i}\" had no parquet files to get the schema from!\n"
        continue
      fi
      impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "create table ${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;"
    done
  done

  # load the data from the staging dir in the respective tables
  echo "copying data in tables and computing stats"
  for i in $(impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" --delimited -q "show tables"); do
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "invalidate metadata ${i};"
    sleep 1 # Sometimes we need a small break between commands, otherwise we get an error.
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "load data inpath '/tmp/${temp_subdir}/${db}.db/${i}' into table ${i}"
    sleep 1
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "invalidate metadata ${i};"
    sleep 1
    impala-shell --user "${HADOOP_USER_NAME}" -i "${IMPALA_HOSTNAME}" -d "${db}" -q "compute stats ${i}"
  done

  # deleting the remaining staging directory from hdfs
  hdfs dfs -conf "${IMPALA_CONFIG_FILE}" -rm -R "/tmp/${temp_subdir}/${db}.db"
}
# Positional parameters: the databases to copy and the hadoop user to act as.
STATS_DB=$1
MONITOR_DB=$2
OBSERVATORY_DB=$3
EXT_DB=$4
USAGE_STATS_DB=$5
HADOOP_USER_NAME=$6

copydb "$USAGE_STATS_DB"
copydb "$PROD_USAGE_STATS_DB"
copydb "$EXT_DB"
copydb "$STATS_DB"
copydb "$MONITOR_DB"
copydb "$OBSERVATORY_DB"

copydb "${MONITOR_DB}_funded"
copydb "${MONITOR_DB}_institutions"
copydb "${MONITOR_DB}_ris_tail"

# Copy the per-context monitor DBs. A context id like "dh-ch::other" maps to
# the DB suffix "dh_ch_other": both '-' and '::' become '_'.
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}; do
  # Parameter expansion instead of the original `echo | sed | sed` pipeline:
  # same global substitutions, no subshells forked per iteration.
  tmp=${i//-/_}
  tmp=${tmp//::/_}
  copydb "${MONITOR_DB}_${tmp}"
done