Merge pull request 'fixed the result_country definition and updated the stats DB copy procedure' (#412) from antonis.lempesis/dnet-hadoop:beta into master

Reviewed-on: #412
This commit is contained in:
Claudio Atzori 2024-04-03 12:34:17 +02:00
commit 5add51f38c
7 changed files with 90 additions and 39 deletions

View File

@@ -9,15 +9,27 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
IMPALA_HDFS_NODE='' IMPALA_HDFS_NODE=''
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
else break
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
exit 1 exit 1
fi fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
function copydb() { function copydb() {

View File

@@ -9,15 +9,28 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
IMPALA_HDFS_NODE='' IMPALA_HDFS_NODE=''
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
else break
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
exit 1 exit 1
fi fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
function copydb() { function copydb() {

View File

@@ -9,15 +9,28 @@ fi
#export HADOOP_USER_NAME=$2 #export HADOOP_USER_NAME=$2
IMPALA_HDFS_NODE='' IMPALA_HDFS_NODE=''
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
else break
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
exit 1 exit 1
fi fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
function copydb() { function copydb() {

View File

@@ -7,15 +7,28 @@ then
fi fi
IMPALA_HDFS_NODE='' IMPALA_HDFS_NODE=''
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu >/dev/null 2>&1; then COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu >/dev/null 2>&1; then break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020' IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
else break
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER!\n\n" else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nPROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! $COUNTER\n\n"
exit 1 exit 1
fi fi
echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE}" echo "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries."
export HADOOP_USER_NAME=$6 export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"

View File

@@ -85,12 +85,12 @@ hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/
hive -f foo hive -f foo
echo "Updated shadow monitor funded database" echo "Updated shadow monitor funded database"
echo "Updating shadow monitor insitutions database" echo "Updating shadow monitor institutions database"
hive -e "drop database if exists ${SHADOW}_institutions cascade" hive -e "drop database if exists ${SHADOW}_institutions cascade"
hive -e "create database if not exists ${SHADOW}_institutions" hive -e "create database if not exists ${SHADOW}_institutions"
hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
hive -f foo hive -f foo
echo "Shadow db monitor insitutions ready!" echo "Shadow db monitor institutions ready!"
echo "Updating shadow monitor RIs database" echo "Updating shadow monitor RIs database"
for i in $contexts for i in $contexts

View File

@@ -335,8 +335,8 @@ select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar from allresults ar
join result_fair rf on rf.organization=ar.organization; /*EOS*/ join result_fair rf on rf.organization=ar.organization; /*EOS*/
DROP VIEW result_fair; /*EOS*/ DROP VIEW result_fair;
DROP VIEW allresults; /*EOS*/ DROP VIEW allresults;
CREATE TEMPORARY VIEW result_fair as CREATE TEMPORARY VIEW result_fair as
select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro
@@ -1000,13 +1000,13 @@ left outer join (
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
create table ${stats_db_name}.result_country stored as parquet as create table ${stats_db_name}.result_country stored as parquet as
select distinct ro.id, coalesce(o.country, f.country) as country select distinct ro.id, coalesce(o.country, f.country)
from ${stats_db_name}.result_organization ro from ${stats_db_name}.result_organization ro
left outer join ${stats_db_name}.organization o on o.id=ro.organization left outer join ${stats_db_name}.organization o on o.id=ro.organization
left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id left outer join ${stats_db_name}.result_projects rp on rp.id=ro.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
left outer join ${stats_db_name}.funder f on f.name=p.funder left outer join ${stats_db_name}.funder f on f.name=p.funder
where coalesce(o.country, f.country) IS NOT NULL; /*EOS*/ where coalesce(o.country, f.country) IS NOT NULL;
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as