[stats wf] merged workflows from branch beta

This commit is contained in:
Claudio Atzori 2023-06-27 14:32:48 +02:00
parent b93e1541aa
commit e10ce92fe5
26 changed files with 1146 additions and 647 deletions

View File

@ -10,7 +10,7 @@ export SOURCE=$1
export PRODUCTION=$2 export PRODUCTION=$2
echo "Updating ${PRODUCTION} database" echo "Updating ${PRODUCTION} database"
impala-shell -q "create database if not exists ${PRODUCTION}" impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Production db ready!" echo "Production db ready!"

View File

@ -0,0 +1,38 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export PRODUCTION=$2
echo "Updating ${PRODUCTION}'_funded' database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_funded'"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_funded' -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_funded'./" | sed "s/$/;/" | impala-shell -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_funded' -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_funded'.\1 as select * from ${SOURCE}'_funded'.\1;/" | impala-shell -c -f -
echo "Production funded db ready!"
echo "Updating ${PRODUCTION}'_institutions' database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_institutions'"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_institutions' -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_institutions'./" | sed "s/$/;/" | impala-shell -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_institutions' -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_institutions'.\1 as select * from ${SOURCE}'_institutions'.\1;/" | impala-shell -c -f -
echo "Production insitutions db ready!"
echo "Updating ${PRODUCTION}'_ris_tail' database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_ris_tail'"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_ris_tail' -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_ris_tail'./" | sed "s/$/;/" | impala-shell -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_RIs_tail' -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_ris_tail'.\1 as select * from ${SOURCE}'_ris_tail'.\1;/" | impala-shell -c -f -
echo "Production RIS tail db ready!"
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_'${tmp}"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_'${tmp} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_'${tmp}./" | sed "s/$/;/" | impala-shell -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_'${tmp} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_'${tmp}.\1 as select * from ${SOURCE}'_'${tmp}.\1;/" | impala-shell -c -f -
echo "Production ${tmp} db ready!"
done

View File

@ -80,10 +80,10 @@
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>updateProductionViews.sh</exec> <exec>updateProductionViewsMonitor.sh</exec>
<argument>${monitor_db_name}</argument> <argument>${monitor_db_name}</argument>
<argument>${monitor_db_production_name}</argument> <argument>${monitor_db_production_name}</argument>
<file>updateProductionViews.sh</file> <file>updateProductionViewsMonitor.sh</file>
</shell> </shell>
<ok to="updateObservatoryViews"/> <ok to="updateObservatoryViews"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -21,7 +21,7 @@
</property> </property>
<property> <property>
<name>hive_jdbc_url</name> <name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228</value> <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property> </property>
<property> <property>
<name>oozie.wf.workflow.notification.url</name> <name>oozie.wf.workflow.notification.url</name>

View File

@ -9,6 +9,8 @@ fi
CONTEXT_API=$1 CONTEXT_API=$1
TARGET_DB=$2 TARGET_DB=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450"
TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6` TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
echo "Downloading context ids" echo "Downloading context ids"
@ -29,21 +31,20 @@ hdfs dfs -copyFromLocal categories.csv ${TMP}
hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -copyFromLocal concepts.csv ${TMP}
hdfs dfs -chmod -R 777 ${TMP} hdfs dfs -chmod -R 777 ${TMP}
export HADOOP_USER="oozie"
export HADOOP_USER_NAME="oozie"
echo "Creating and populating impala tables" echo "Creating and populating impala tables"
impala-shell -q "invalidate metadata" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
impala-shell -d ${TARGET_DB} -q "invalidate metadata" hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
impala-shell -d ${TARGET_DB} -q "invalidate metadata" hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
echo "Cleaning up" echo "Cleaning up"
hdfs dfs -rm -f -r -skipTrash ${TMP}
rm concepts.csv rm concepts.csv
rm categories.csv rm categories.csv
rm contexts.csv rm contexts.csv
echo "Finito!" echo "Finito!"

View File

@ -0,0 +1,97 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
#export HADOOP_USER_NAME="dimitris.pierrakos"
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
function copydb() {
db=$1
FILE=("hive_wf_tmp_"$RANDOM)
hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
# copy the databases from ocean to impala
echo "copying $db"
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
# change ownership to impala
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
# drop tables from db
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
do
`impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop table $i;"`;
done
# drop views from db
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
do
`impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop view $i;"`;
done
# delete the database
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
# create the databases
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
echo "creating schema for ${db}"
for (( k = 0; k < 5; k ++ )); do
for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
do
impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
done
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
# do
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
#
# # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
# do
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
# load the data from /tmp in the respective tables
echo "copying data in tables and computing stats"
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
do
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
done
# deleting the remaining directory from hdfs
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
}
STATS_DB=$1
MONITOR_DB=$2
OBSERVATORY_DB=$3
EXT_DB=$4
USAGE_STATS_DB=$5
HADOOP_USER_NAME=$6
copydb $USAGE_STATS_DB
copydb $PROD_USAGE_STATS_DB
copydb $EXT_DB
copydb $STATS_DB
#copydb $MONITOR_DB
copydb $OBSERVATORY_DB
copydb $MONITOR_DB'_funded'
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_RIs_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
copydb ${MONITOR_DB}'_'${tmp}
done

View File

@ -0,0 +1,48 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
function createShadowDB() {
SOURCE=$1
SHADOW=$2
# drop views from db
for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} --delimited -q "show tables"`;
do
`impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${SHADOW} -q "drop view $i;"`;
done
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
}
STATS_DB=$1
STATS_DB_SHADOW=$2
MONITOR_DB=$3
MONITOR_DB_SHADOW=$4
OBSERVATORY_DB=$5
OBSERVATORY_DB_SHADOW=$6
USAGE_STATS_DB=$7
USAGE_STATS_DB_SHADOW=$8
createShadowDB $STATS_DB $STATS_DB_SHADOW
createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW
createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW
createShadowDB USAGE_STATS_DB USAGE_STATS_DB_SHADOW
createShadowDB $MONITOR_DB'_funded' $MONITOR_DB'_funded_shadow'
createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow'
createShadowDB $MONITOR_DB'_RIs_tail' $MONITOR_DB'_RIs_tail_shadow'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
createShadowDB ${MONITOR_DB}'_'${tmp} ${MONITOR_DB}'_'${tmp}'_shadow'
done

View File

@ -8,12 +8,12 @@ fi
export SOURCE=$1 export SOURCE=$1
export SHADOW=$2 export SHADOW=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Updating shadow database" echo "Updating shadow database"
impala-shell -q "invalidate metadata" hive -e "drop database if exists ${SHADOW} cascade"
impala-shell -d ${SOURCE} -q "invalidate metadata" hive -e "create database if not exists ${SHADOW}"
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
impala-shell -q "create database if not exists ${SHADOW}" hive -f foo
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - echo "Updated shadow database"
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Shadow db ready!"

View File

@ -7,13 +7,17 @@ then
fi fi
export TARGET=$1 export TARGET=$1
export SCRIPT_PATH=$2 export STATS_EXT=$2
export SCRIPT_PATH=$3
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating indicators" echo "Creating indicators"
impala-shell -q "invalidate metadata" hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - hive $HIVE_OPTS -f foo
cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql
echo "Indicators created" echo "Indicators created"

View File

@ -0,0 +1,19 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export SHADOW=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Updating shadow database"
hive -e "drop database if exists ${SHADOW} cascade"
hive -e "create database if not exists ${SHADOW}"
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo
echo "Updated shadow database"

View File

@ -10,16 +10,88 @@ export SOURCE=$1
export TARGET=$2 export TARGET=$2
export SHADOW=$3 export SHADOW=$3
export SCRIPT_PATH=$4 export SCRIPT_PATH=$4
export SCRIPT_PATH2=$5
export SCRIPT_PATH3=$6
export SCRIPT_PATH4=$7
export SCRIPT_PATH5=$8
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $4 echo "Getting file from " $4
hdfs dfs -copyToLocal $4 hdfs dfs -copyToLocal $4
echo "Creating monitor database" echo "Getting file from " $5
cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - hdfs dfs -copyToLocal $5
echo "Impala shell finished"
echo "Updating shadow monitor database" echo "Getting file from " $6
impala-shell -q "create database if not exists ${SHADOW}" hdfs dfs -copyToLocal $6
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - echo "Getting file from " $7
echo "Shadow db ready!" hdfs dfs -copyToLocal $7
echo "Getting file from " $8
hdfs dfs -copyToLocal $8
echo "Creating monitor database"
cat step20-createMonitorDB_funded.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo
hive $HIVE_OPTS -f foo
#
cat step20-createMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
tmp2=`echo "$i" |sed 's/:.*//' `
cat step20-createMonitorDB_RIs.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" | sed "s/CONTEXT/\'%$tmp2%\'/g" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" > foo
hive $HIVE_OPTS -f foo
done
cat step20-createMonitorDB_RIs_tail.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" | sed "s/CONTEXTS/\"'knowmad::other','dh-ch::other', 'enermaps::other', 'gotriple::other', 'neanias-atmospheric::other', 'rural-digital-europe::other', 'covid-19::other', 'aurora::other', 'neanias-space::other', 'north-america-studies::other', 'north-american-studies::other', 'eutopia::other'\"/g" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"
echo "Updating shadow monitor funded database"
hive -e "drop database if exists ${SHADOW}_funded cascade"
hive -e "create database if not exists ${SHADOW}_funded"
hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_funded.\1 as select * from ${2}_funded.\1;/" > foo
hive -f foo
echo "Updated shadow monitor funded database"
echo "Updating shadow monitor insitutions database"
hive -e "drop database if exists ${SHADOW}_institutions cascade"
hive -e "create database if not exists ${SHADOW}_institutions"
hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
hive -f foo
echo "Shadow db monitor insitutions ready!"
echo "Updating shadow monitor RIs database"
for i in $contexts
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
hive -e "drop database if exists ${SHADOW}_${tmp} cascade"
hive -e "create database if not exists ${SHADOW}_${tmp}"
hive $HIVE_OPTS --database ${2}_${tmp} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_${tmp}.\1 as select * from ${2}_${tmp}.\1;/" > foo
hive -f foo
done
echo "Shadow db monitor RIs ready!"
echo "Updating shadow monitor RIs tail database"
hive -e "drop database if exists ${SHADOW}_ris_tail cascade"
hive -e "create database if not exists ${SHADOW}_ris_tail"
hive $HIVE_OPTS --database ${2}_ris_tail -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_ris_tail.\1 as select * from ${2}_ris_tail.\1;/" > foo
hive -f foo
echo "Shadow db monitor RIs tail ready!"

View File

@ -7,15 +7,13 @@ then
fi fi
export SOURCE=$1 export SOURCE=$1
export TARGET=$2 export SHADOW=$2
export SHADOW=$3 export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
impala-shell -q "invalidate metadata;" echo "Updating shadow database"
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - hive -e "drop database if exists ${SHADOW} cascade"
echo "Impala shell finished" hive -e "create database if not exists ${SHADOW}"
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo
echo "Updating shadow observatory database" hive -f foo
impala-shell -q "create database if not exists ${SHADOW}" echo "Updated shadow database"
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
echo "Shadow db ready!"

View File

@ -10,7 +10,11 @@ export SOURCE=$1
export TARGET=$2 export TARGET=$2
export SHADOW=$3 export SHADOW=$3
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Creating observatory database" echo "Creating observatory database"
impala-shell -q "drop database if exists ${TARGET} cascade" hive -e "drop database if exists ${TARGET} cascade"
impala-shell -q "create database if not exists ${TARGET}" hive -e "create database if not exists ${TARGET}"
impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo
hive -f foo

View File

@ -46,4 +46,8 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;

View File

@ -33,4 +33,12 @@ select * from ${stats_db_name}.dataset_refereed
union all union all
select * from ${stats_db_name}.software_refereed select * from ${stats_db_name}.software_refereed
union all union all
select * from ${stats_db_name}.otherresearchproduct_refereed; select * from ${stats_db_name}.otherresearchproduct_refereed;
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
where measures_ids.id!='views' and measures_ids.id!='downloads';
ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;

View File

@ -1,21 +1,21 @@
------------------------------------------- -------------------------------------------
--- Extra tables, mostly used by indicators --- Extra tables, mostly used by indicators
create table ${stats_db_name}.result_projectcount STORED AS PARQUET as create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
select r.id, count(distinct p.id) as count select r.id, count(distinct p.id) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id;
create table ${stats_db_name}.result_fundercount STORED AS PARQUET as create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
select r.id, count(distinct p.funder) as count select r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
group by r.id; group by r.id;
create table ${stats_db_name}.project_resultcount STORED AS PARQUET as create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as ( with rcount as (
select p.id as pid, count(distinct r.id) as `count`, r.type as type select p.id as pid, count(distinct r.id) as `count`, r.type as type
from ${stats_db_name}.project p from ${stats_db_name}.project p
@ -29,17 +29,22 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
from rcount from rcount
group by rcount.pid; group by rcount.pid;
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
create table ${stats_db_name}.result_instance stored as parquet as create table if not exists ${stats_db_name}.result_instance stored as parquet as
select distinct r.* select distinct r.*
from ( from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom, select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id; join ${stats_db_name}.result res on res.id=r.id;
create table ${stats_db_name}.result_apc as create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
select r.id, r.amount, r.currency select r.id, r.amount, r.currency
from ( from (
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
@ -47,4 +52,4 @@ from (
join ${stats_db_name}.result res on res.id=r.id join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null; where r.amount is not null;
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;

View File

@ -1,5 +1,78 @@
drop database if exists TARGET cascade; --drop database if exists TARGET cascade;
create database if not exists TARGET; --create database if not exists TARGET;
--
--create view if not exists TARGET.category as select * from SOURCE.category;
--create view if not exists TARGET.concept as select * from SOURCE.concept;
--create view if not exists TARGET.context as select * from SOURCE.context;
--create view if not exists TARGET.country as select * from SOURCE.country;
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
--create view if not exists TARGET.funder as select * from SOURCE.funder;
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--
--create table TARGET.result stored as parquet as
-- select distinct * from (
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
-- ) )) foo;
--
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.concept as select * from SOURCE.concept;
@ -10,128 +83,90 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_
create view if not exists TARGET.funder as select * from SOURCE.funder; create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref; create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create table TARGET.result stored as parquet as create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
select distinct * from ( create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
union all create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan
) )) foo;
compute stats TARGET.result;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_citations; ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_references_oc; ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_citations_oc; ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_classifications; ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_apc; ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_concepts; ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_datasources; ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_fundercount; ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_gold; ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_greenoa; ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_languages; ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_licenses; ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_oids; ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_organization; ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_peerreviewed; ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_pids; ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_projectcount; ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_projects; ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_refereed; ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_sources; ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics; ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_fos; ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1; drop view TARGET.foo1;
drop view TARGET.foo2; drop view TARGET.foo2;
compute stats TARGET.result_result; ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
-- datasources -- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource; create view if not exists TARGET.datasource as select * from SOURCE.datasource;
@ -140,7 +175,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
compute stats TARGET.datasource_results; ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
-- organizations -- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization; create view if not exists TARGET.organization as select * from SOURCE.organization;
@ -155,30 +190,31 @@ create view if not exists TARGET.project_oids as select * from SOURCE.project_oi
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
compute stats TARGET.project_results; ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
-- indicators -- indicators
-- Sprint 1 ---- -- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_green_oa; ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_grey_lit; ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_doi_from_crossref; ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
-- Sprint 2 ---- -- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_has_cc_licence; ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_has_cc_licence_url; ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_has_abstract; ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_with_orcid; ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
---- Sprint 3 ---- ---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_funded_result_with_fundref; ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
@ -187,30 +223,32 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
---- Sprint 4 ---- ---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_diamond; ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_in_transformative; ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_closed_other_open; ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
---- Sprint 5 ---- ---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_no_of_copies; ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ---- ---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_hybrid_oa_with_cc; ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads; ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads_datasource; ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads_year; ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
compute stats TARGET.indi_pub_downloads_datasource_year; ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
---- Sprint 7 ---- ---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_gold_oa; ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_hybrid; ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
@ -221,11 +259,12 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--compute stats TARGET.indi_datasets_gold_oa; ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--compute stats TARGET.indi_software_gold_oa; ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;

View File

@ -0,0 +1,15 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists
(select 1
from SOURCE.result_concepts rc
join SOURCE.concept conc on conc.id=rc.concept
join SOURCE.category cat on cat.id=conc.category
join SOURCE.context cont on cont.id=cat.context
-- join SOURCE.result
where rc.id=r.id and conc.category like CONTEXT)
) foo;
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,15 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists
(select 1
from SOURCE.result_concepts rc
join SOURCE.concept conc on conc.id=rc.concept
join SOURCE.category cat on cat.id=conc.category
join SOURCE.context cont on cont.id=cat.context
-- join SOURCE.result
where rc.id=r.id and conc.category not in (CONTEXTS))
) foo;
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,9 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
) foo;
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,58 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
'openorgs____::1624ff7c01bb641b91f4518539a0c28a' -- Vrije Universiteit Amsterdam
))) foo;
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -8,6 +8,8 @@ from ${stats_db_name}.result r
group by rl.id group by rl.id
) rln on rln.id=r.id; ) rln on rln.id=r.id;
ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_country stored as parquet as create table ${observatory_db_name}.result_affiliated_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_year stored as parquet as create table ${observatory_db_name}.result_affiliated_year stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -215,6 +229,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_country stored as parquet as create table ${observatory_db_name}.result_deposited_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_year stored as parquet as create table ${observatory_db_name}.result_deposited_year stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_organization stored as parquet as create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_funder stored as parquet as create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -558,4 +594,6 @@ from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS;

View File

@ -84,4 +84,12 @@ create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id, select distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname xpath_string(fund, '//funder/shortname') as shortname
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;
CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS
SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
properties[0].value contribution, properties[1].value currency
from ${openaire_db_name}.relation r
LATERAL VIEW explode (r.properties) properties
where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%'
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;

View File

@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false;
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
-- Creating a temporary dual table that will be removed after the following insert -- Creating a temporary dual table that will be removed after the following insert
CREATE TABLE ${stats_db_name}.dual CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
(
dummy CHAR(1) INSERT INTO ${stats_db_name}.dual VALUES ('X');
);
INSERT INTO ${stats_db_name}.dual
VALUES ('X');
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
SELECT 'other', SELECT 'other',
@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
DROP TABLE ${stats_db_name}.dual; DROP TABLE ${stats_db_name}.dual;
UPDATE ${stats_db_name}.datasource_tmp UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
SET name='Other' UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
WHERE name = 'Unknown Repository';
UPDATE ${stats_db_name}.datasource_tmp
SET yearofvalidation=null
WHERE yearofvalidation = '-1';
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, langs.languages AS language SELECT substr(d.id, 4) AS id, langs.languages AS language
@ -104,4 +98,4 @@ where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources; FROM ${stats_db_name}.result_datasources;

View File

@ -1,4 +1,4 @@
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Graph Stats Hive" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>stats_db_name</name> <name>stats_db_name</name>
@ -10,9 +10,12 @@
</property> </property>
<property> <property>
<name>external_stats_db_name</name> <name>external_stats_db_name</name>
<value>stats_ext</value>
<description>the external stats that should be added since they are not included in the graph database</description> <description>the external stats that should be added since they are not included in the graph database</description>
</property> </property>
<property>
<name>usage_stats_db_name</name>
<description>the usage statistics database name</description>
</property>
<property> <property>
<name>stats_db_shadow_name</name> <name>stats_db_shadow_name</name>
<description>the name of the shadow schema</description> <description>the name of the shadow schema</description>
@ -33,6 +36,10 @@
<name>observatory_db_shadow_name</name> <name>observatory_db_shadow_name</name>
<description>the name of the shadow monitor db</description> <description>the name of the shadow monitor db</description>
</property> </property>
<property>
<name>usage_stats_db_shadow_name</name>
<description>the name of the shadow usage stats db</description>
</property>
<property> <property>
<name>stats_tool_api_url</name> <name>stats_tool_api_url</name>
<description>The url of the API of the stats tool. Is used to trigger the cache update.</description> <description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
@ -53,6 +60,10 @@
<name>context_api_url</name> <name>context_api_url</name>
<description>the base url of the context api (https://services.openaire.eu/openaire)</description> <description>the base url of the context api (https://services.openaire.eu/openaire)</description>
</property> </property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters> </parameters>
<global> <global>
@ -67,10 +78,47 @@
<name>hive.txn.timeout</name> <name>hive.txn.timeout</name>
<value>${hive_timeout}</value> <value>${hive_timeout}</value>
</property> </property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration> </configuration>
</global> </global>
<start to="Step1"/> <start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="Step1">${wf:conf('resumeFrom') eq 'Step1'}</case>
<case to="Step2">${wf:conf('resumeFrom') eq 'Step2'}</case>
<case to="Step3">${wf:conf('resumeFrom') eq 'Step3'}</case>
<case to="Step4">${wf:conf('resumeFrom') eq 'Step4'}</case>
<case to="Step5">${wf:conf('resumeFrom') eq 'Step5'}</case>
<case to="Step6">${wf:conf('resumeFrom') eq 'Step6'}</case>
<case to="Step7">${wf:conf('resumeFrom') eq 'Step7'}</case>
<case to="Step8">${wf:conf('resumeFrom') eq 'Step8'}</case>
<case to="Step9">${wf:conf('resumeFrom') eq 'Step9'}</case>
<case to="Step10">${wf:conf('resumeFrom') eq 'Step10'}</case>
<case to="Step11">${wf:conf('resumeFrom') eq 'Step11'}</case>
<case to="Step12">${wf:conf('resumeFrom') eq 'Step12'}</case>
<case to="Step13">${wf:conf('resumeFrom') eq 'Step13'}</case>
<case to="Step14">${wf:conf('resumeFrom') eq 'Step14'}</case>
<case to="Step15">${wf:conf('resumeFrom') eq 'Step15'}</case>
<case to="Step15_5">${wf:conf('resumeFrom') eq 'Step15_5'}</case>
<case to="Contexts">${wf:conf('resumeFrom') eq 'Contexts'}</case>
<case to="Step16-createIndicatorsTables">${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'}</case>
<case to="Step16_1-definitions">${wf:conf('resumeFrom') eq 'Step16_1-definitions'}</case>
<case to="Step16_5">${wf:conf('resumeFrom') eq 'Step16_5'}</case>
<case to="Step19-finalize">${wf:conf('resumeFrom') eq 'Step19-finalize'}</case>
<case to="step20-createMonitorDB">${wf:conf('resumeFrom') eq 'step20-createMonitorDB'}</case>
<case to="step21-createObservatoryDB-pre">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'}</case>
<case to="step21-createObservatoryDB">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'}</case>
<case to="step21-createObservatoryDB-post">${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'}</case>
<case to="step22-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'}</case>
<case to="step23-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'}</case>
<case to="Step24-updateCache">${wf:conf('resumeFrom') eq 'Step24-updateCache'}</case>
<default to="Step1"/>
</switch>
</decision>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -249,6 +297,7 @@
<script>scripts/step15_5.sql</script> <script>scripts/step15_5.sql</script>
<param>stats_db_name=${stats_db_name}</param> <param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param> <param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2> </hive2>
<ok to="Contexts"/> <ok to="Contexts"/>
<error to="Kill"/> <error to="Kill"/>
@ -273,6 +322,7 @@
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>indicators.sh</exec> <exec>indicators.sh</exec>
<argument>${stats_db_name}</argument> <argument>${stats_db_name}</argument>
<argument>${external_stats_db_name}</argument>
<argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument> <argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument>
<file>indicators.sh</file> <file>indicators.sh</file>
</shell> </shell>
@ -301,7 +351,7 @@
<ok to="Step19-finalize"/> <ok to="Step19-finalize"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step19-finalize"> <action name="Step19-finalize">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
@ -324,12 +374,29 @@
<argument>${monitor_db_name}</argument> <argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument> <argument>${monitor_db_shadow_name}</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument> <argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_funded.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_institutions.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_RIs.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_RIs_tail.sql</argument>
<file>monitor.sh</file> <file>monitor.sh</file>
</shell> </shell>
<ok to="step21-createObservatoryDB-pre"/> <ok to="step21-createObservatoryDB-pre"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="step20-createMonitorDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>monitor-post.sh</exec>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <file>monitor-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="step21-createObservatoryDB-pre"> <action name="step21-createObservatoryDB-pre">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
@ -360,16 +427,53 @@
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<exec>observatory-post.sh</exec> <exec>observatory-post.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${observatory_db_name}</argument> <argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument> <argument>${observatory_db_shadow_name}</argument>
<file>observatory-post.sh</file> <file>observatory-post.sh</file>
</shell> </shell>
<ok to="Step22"/> <ok to="step22-copyDataToImpalaCluster"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="Step22"> <action name="step22-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
<!-- <argument>${external_stats_db_name}</argument>-->
<argument>${stats_db_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${external_stats_db_name}</argument>
<argument>${usage_stats_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="step23-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="step23-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<argument>${usage_stats_db_name}</argument>
<argument>${usage_stats_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="Step24-updateCache"/>
<error to="Kill"/>
</action>
<action name="Step24-updateCache">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
@ -382,4 +486,4 @@
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>