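# copy the stats databases from the ocean cluster to the Impala cluster
# impala-shell needs a writable Python egg cache; keep it per user and expose it via a /tmp symlink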
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
function copydb() {
    db=$1
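    # stage the copy under a randomly named temporary directory on the Impala cluster's HDFS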
    FILE="hive_wf_tmp_${RANDOM}"
    hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
    # copy the databases from ocean to impala
    echo "copying $db"
    hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
    # make the copied files world-writable so impala can read and load them
    hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
    # drop tables from db
    for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
    do
        impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;";
    done
    # drop views from db
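    # "show tables" in Impala lists views as well, so the same query drives this loop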
    for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
    do
        impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;";
    done
    # delete the database
    impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
    # create the database
    impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
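    # refresh Impala's metadata cache so the recreated database is picked up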
    impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
    echo "creating schema for ${db}"
    for ((k = 0; k < 5; k++)); do
        for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
        do
            impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
        done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
    done
    # load the data from /tmp into the respective tables
    echo "copying data into tables and computing stats"
    for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
    do
        impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
        impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
    done
    # delete the staging directory from hdfs
    hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
}
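
# positional arguments:
#   $1 stats db, $2 monitor db, $3 observatory db, $4 external db,
#   $5 usage stats db, $6 hadoop user name for the impala-shell connections
# example invocation (script and database names below are hypothetical):
#   bash copy_to_impala.sh openaire_prod_stats openaire_prod_monitor openaire_prod_observatory stats_ext openaire_prod_usage_stats some_hadoop_user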
STATS_DB=$1
MONITOR_DB=$2
OBSERVATORY_DB=$3
EXT_DB=$4
USAGE_STATS_DB=$5
HADOOP_USER_NAME=$6

copydb $USAGE_STATS_DB
copydb $PROD_USAGE_STATS_DB
copydb $EXT_DB
copydb $STATS_DB
copydb $MONITOR_DB
copydb $OBSERVATORY_DB

copydb $MONITOR_DB'_funded'
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_ris_tail'
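
# each monitor context below gets its own database copy; '-' and '::' in a context id
# are mapped to '_' to form the database name suffix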
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
    tmp=`echo "$i" | sed 's/-/_/g' | sed 's/::/_/g'`
    copydb ${MONITOR_DB}'_'${tmp}
done