diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh index 3e510e87e..a6d7b289d 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh @@ -10,7 +10,7 @@ export SOURCE=$1 export PRODUCTION=$2 echo "Updating ${PRODUCTION} database" -impala-shell -q "create database if not exists ${PRODUCTION}" -impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}" +impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - echo "Production db ready!" 
\ No newline at end of file diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViewsMonitor.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViewsMonitor.sh new file mode 100644 index 000000000..68844b14c --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViewsMonitor.sh @@ -0,0 +1,38 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export PRODUCTION=$2 + +echo "Updating ${PRODUCTION}'_funded' database" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_funded'" +impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_funded' -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_funded'./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_funded' -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_funded'.\1 as select * from ${SOURCE}'_funded'.\1;/" | impala-shell -c -f - +echo "Production funded db ready!" 
+ +echo "Updating ${PRODUCTION}'_institutions' database" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_institutions'" +impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_institutions' -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_institutions'./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_institutions' -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_institutions'.\1 as select * from ${SOURCE}'_institutions'.\1;/" | impala-shell -c -f - +echo "Production insitutions db ready!" + +echo "Updating ${PRODUCTION}'_ris_tail' database" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_ris_tail'" +impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_ris_tail' -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_ris_tail'./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_RIs_tail' -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_ris_tail'.\1 as select * from ${SOURCE}'_ris_tail'.\1;/" | impala-shell -c -f - +echo "Production RIS tail db ready!" 
+ +contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" +for i in ${contexts} +do + tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}'_'${tmp}" + impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}'_'${tmp} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}'_'${tmp}./" | sed "s/$/;/" | impala-shell -c -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}'_'${tmp} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}'_'${tmp}.\1 as select * from ${SOURCE}'_'${tmp}.\1;/" | impala-shell -c -f - + echo "Production ${tmp} db ready!" +done diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 8286e5039..605c86ac9 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -80,10 +80,10 @@ ${jobTracker} ${nameNode} - updateProductionViews.sh + updateProductionViewsMonitor.sh ${monitor_db_name} ${monitor_db_production_name} - updateProductionViews.sh + updateProductionViewsMonitor.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml index 63fc84d75..98a182175 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml @@ -21,7 +21,7 @@ hive_jdbc_url - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228 oozie.wf.workflow.notification.url diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index b66ab47e0..a436d0380 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -9,6 +9,8 @@ fi CONTEXT_API=$1 TARGET_DB=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" + TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 93faa43d6..2f1eefa0c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ 
-7,13 +7,17 @@ then fi export TARGET=$1 -export SCRIPT_PATH=$2 +export STATS_EXT=$2 +export SCRIPT_PATH=$3 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false" +export HADOOP_USER_NAME="oozie" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - -echo "Indicators created" \ No newline at end of file +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo +hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql +echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh new file mode 100644 index 000000000..5863625a1 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh @@ -0,0 +1,19 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index c5bda6d39..08f4c9232 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -10,16 +10,88 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 export SCRIPT_PATH=$4 +export SCRIPT_PATH2=$5 +export SCRIPT_PATH3=$6 +export SCRIPT_PATH4=$7 +export SCRIPT_PATH5=$8 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" echo "Getting file from " $4 hdfs dfs 
-copyToLocal $4 -echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - -echo "Impala shell finished" +echo "Getting file from " $5 +hdfs dfs -copyToLocal $5 -echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -echo "Shadow db ready!" \ No newline at end of file +echo "Getting file from " $6 +hdfs dfs -copyToLocal $6 + +echo "Getting file from " $7 +hdfs dfs -copyToLocal $7 + +echo "Getting file from " $8 +hdfs dfs -copyToLocal $8 + +echo "Creating monitor database" +cat step20-createMonitorDB_funded.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo +hive $HIVE_OPTS -f foo +cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo +hive $HIVE_OPTS -f foo +# +cat step20-createMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo +hive $HIVE_OPTS -f foo +cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo +hive $HIVE_OPTS -f foo + +contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" + +for i in ${contexts} +do + tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` + tmp2=`echo "$i" |sed 's/:.*//' ` + cat step20-createMonitorDB_RIs.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" | sed "s/CONTEXT/\'%$tmp2%\'/g" > foo + hive $HIVE_OPTS -f foo + cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" > foo + hive $HIVE_OPTS 
-f foo +done + + +cat step20-createMonitorDB_RIs_tail.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" | sed "s/CONTEXTS/\"'knowmad::other','dh-ch::other', 'enermaps::other', 'gotriple::other', 'neanias-atmospheric::other', 'rural-digital-europe::other', 'covid-19::other', 'aurora::other', 'neanias-space::other', 'north-america-studies::other', 'north-american-studies::other', 'eutopia::other'\"/g" > foo +hive $HIVE_OPTS -f foo +cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" > foo +hive $HIVE_OPTS -f foo + +echo "Hive shell finished" + +echo "Updating shadow monitor funded database" +hive -e "drop database if exists ${SHADOW}_funded cascade" +hive -e "create database if not exists ${SHADOW}_funded" +hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_funded.\1 as select * from ${2}_funded.\1;/" > foo +hive -f foo +echo "Updated shadow monitor funded database" + +echo "Updating shadow monitor insitutions database" +hive -e "drop database if exists ${SHADOW}_institutions cascade" +hive -e "create database if not exists ${SHADOW}_institutions" +hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo +hive -f foo +echo "Shadow db monitor insitutions ready!" + +echo "Updating shadow monitor RIs database" +for i in $contexts +do + tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` + hive -e "drop database if exists ${SHADOW}_${tmp} cascade" + hive -e "create database if not exists ${SHADOW}_${tmp}" + hive $HIVE_OPTS --database ${2}_${tmp} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_${tmp}.\1 as select * from ${2}_${tmp}.\1;/" > foo + hive -f foo +done +echo "Shadow db monitor RIs ready!" 
+ +echo "Updating shadow monitor RIs tail database" +hive -e "drop database if exists ${SHADOW}_ris_tail cascade" +hive -e "create database if not exists ${SHADOW}_ris_tail" +hive $HIVE_OPTS --database ${2}_ris_tail -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_ris_tail.\1 as select * from ${2}_ris_tail.\1;/" > foo +hive -f foo +echo "Shadow db monitor RIs tail ready!" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index db8d39af2..5863625a1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -7,15 +7,13 @@ then fi export SOURCE=$1 -export TARGET=$2 -export SHADOW=$3 +export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" -impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - -echo "Impala shell finished" - -echo "Updating shadow observatory database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -echo "Shadow db ready!" 
\ No newline at end of file +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh index 55a308c50..37671cce8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -10,7 +10,11 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +hive -e "drop database if exists ${TARGET} cascade" +hive -e "create database if not exists ${TARGET}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 47a6f84c2..39755d68e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -46,4 +46,8 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; \ No newline at end of file + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; + +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as +select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result +lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index cec22cd3e..132cb482e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -33,4 +33,12 @@ select * from ${stats_db_name}.dataset_refereed union all select * from ${stats_db_name}.software_refereed union all -select * from ${stats_db_name}.otherresearchproduct_refereed; \ No newline at end of file +select * from ${stats_db_name}.otherresearchproduct_refereed; + +create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as +select substr(id, 4) as id, 
measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, +cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class +from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids +where measures_ids.id!='views' and measures_ids.id!='downloads'; + +ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 86ead4a2c..f39ff2afd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,21 +1,21 @@ ------------------------------------------- --- Extra tables, mostly used by indicators -create table ${stats_db_name}.result_projectcount STORED AS PARQUET as +create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as select r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; -create table ${stats_db_name}.result_fundercount STORED AS PARQUET as +create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as select r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; -create table ${stats_db_name}.project_resultcount STORED AS PARQUET as +create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( select p.id as pid, count(distinct r.id) as `count`, r.type as type from 
${stats_db_name}.project p @@ -29,17 +29,22 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; +create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; +create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; +create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; +create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; +create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; +create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; -create table ${stats_db_name}.result_instance stored as parquet as +create table if not exists ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( - select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom, + select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; -create table ${stats_db_name}.result_apc as +create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as select r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency @@ -47,4 +52,4 @@ 
from ( join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 1bda07629..36b34cc3c 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,5 +1,5 @@ -- Sprint 1 ---- -create table indi_pub_green_oa stored as parquet as +create table if not exists indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from publication p left outer join ( @@ -12,9 +12,9 @@ from publication p or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp on p.id= tmp.id; -compute stats indi_pub_green_oa; +ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS; -create table indi_pub_grey_lit stored as parquet as +create table if not exists indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit from publication p left outer join ( @@ -25,9 +25,9 @@ from publication p not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; -compute stats indi_pub_grey_lit; +ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS; -create table indi_pub_doi_from_crossref stored as parquet as +create table if not exists indi_pub_doi_from_crossref stored 
as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from publication p left outer join @@ -36,10 +36,10 @@ from publication p where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp on tmp.id=p.id; -compute stats indi_pub_doi_from_crossref; +ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- -create table indi_result_has_cc_licence stored as parquet as +create table if not exists indi_result_has_cc_licence stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from result r left outer join (select r.id, license.type as lic from result r @@ -47,9 +47,9 @@ from result r where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence; +ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS; -create table indi_result_has_cc_licence_url stored as parquet as +create table if not exists indi_result_has_cc_licence_url stored as parquet as select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url from result r left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host @@ -58,31 +58,31 @@ from result r WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence_url; +ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS; -create table indi_pub_has_abstract stored as parquet as -select distinct publication.id, coalesce(abstract, 1) has_abstract +create table if not exists indi_pub_has_abstract stored as parquet as +select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract from publication; -compute stats indi_pub_has_abstract; +ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS; -create table indi_result_with_orcid stored as parquet as +create table if not exists 
indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid from result r left outer join (select id, 1 as has_orcid from result_orcid) tmp on r.id= tmp.id; -compute stats indi_result_with_orcid; +ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- -create table indi_funded_result_with_fundref stored as parquet as +create table if not exists indi_funded_result_with_fundref stored as parquet as select distinct r.result as id, coalesce(fundref, 0) as fundref from project_results r left outer join (select distinct result, 1 as fundref from project_results where provenance='Harvested') tmp on r.result= tmp.result; -compute stats indi_funded_result_with_fundref; +ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS; -- create table indi_result_org_collab stored as parquet as -- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations @@ -92,77 +92,65 @@ compute stats indi_funded_result_with_fundref; -- -- compute stats indi_result_org_collab; -- -create table indi_result_org_collab stored as parquet as -with tmp as ( -select distinct ro.organization organization, ro.id from result_organization ro -join organization o on o.id=ro.organization where o.name is not null) -select o1.organization org1, o2.organization org2, count(o1.id) as collaborations +create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id, o.name from result_organization ro +join organization o on o.id=ro.organization where o.name is not null; + +create table if not exists indi_result_org_collab stored as parquet as +select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations from tmp as o1 -join tmp as o2 on o1.id=o2.id and o1.organization!=o2.organization -group by org1, org2; +join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name +group by o1.organization, o2.organization, 
o1.name, o2.name; -compute stats indi_result_org_collab; +drop table tmp purge; --- create table indi_result_org_country_collab stored as parquet as --- with tmp as --- (select o.id as id, o.country , ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where o.country <> 'UNKNOWN') --- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id and o1.country<>o2.country --- group by o1.id, o1.type,o2.country; --- --- compute stats indi_result_org_country_collab; --- -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select distinct ro.organization organization, ro.id, o.country from result_organization ro -join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) -select o1.organization org1,o2.country country2, count(o1.id) as collaborations +ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS; + +create TEMPORARY TABLE tmp AS +select distinct ro.organization organization, ro.id, o.name, o.country from result_organization ro +join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_org_country_collab stored as parquet as +select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country -group by o1.organization, o1.id, o2.country; +group by o1.organization, o1.id, o1.name, o2.country; -compute stats indi_result_org_country_collab; +drop table tmp purge; --- create table indi_result_org_collab stored as parquet as --- with tmp as --- (select o.id, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id) --- select o1.id 
org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id --- group by o1.id, o2.id, o1.type; --- --- compute stats indi_result_org_collab; --- -create table indi_project_collab_org stored as parquet as -select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations -from organization_projects as o1 - join organization_projects as o2 on o1.project=o2.project -where o1.id!=o2.id -group by o1.id, o2.id; +ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS; -compute stats indi_project_collab_org; +create TEMPORARY TABLE tmp AS +select o.id organization, o.name, ro.project as project from organization o + join organization_projects ro on o.id=ro.id where o.name is not null; -create table indi_project_collab_org_country stored as parquet as - with tmp as - (select o.id organization, o.country , ro.project as project from organization o +create table if not exists indi_project_collab_org stored as parquet as +select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations +from tmp as o1 + join tmp as o2 on o1.project=o2.project +where o1.organization<>o2.organization and o1.name<>o2.name +group by o1.name,o2.name, o1.organization, o2.organization; + +drop table tmp purge; + +ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS; + +create TEMPORARY TABLE tmp AS +select o.id organization, o.name, o.country , ro.project as project from organization o join organization_projects ro on o.id=ro.id - and o.country <> 'UNKNOWN') -select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations + and o.country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_project_collab_org_country stored as parquet as +select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations from tmp as o1 join tmp 
as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country -group by o1.organization, o2.country; +group by o1.organization, o2.country, o1.name; -compute stats indi_project_collab_org_country; +drop table tmp purge; -create table indi_funder_country_collab stored as parquet as +ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS; + +create table if not exists indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from organization_projects op join organization o on o.id=op.id join project p on p.id=op.project @@ -173,72 +161,50 @@ from tmp as f1 where f1.country<>f2.country group by f1.funder, f2.country, f1.country; -compute stats indi_funder_country_collab; --- --- create table indi_result_country_collab stored as parquet as --- with tmp as --- (select country, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where country <> 'UNKNOWN') --- select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.country<>o2.country --- group by o1.country, o2.country, o1.type; --- --- compute stats indi_result_country_collab; +ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS; -create table indi_result_country_collab stored as parquet as -with tmp as - (select distinct country, ro.id as result from organization o +create TEMPORARY TABLE tmp AS +select distinct country, ro.id as result from organization o join result_organization ro on o.id=ro.organization - where country <> 'UNKNOWN' and o.name is not null) + where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_country_collab stored as parquet as select o1.country country1, o2.country country2, count(o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result where 
o1.country<>o2.country group by o1.country, o2.country; -compute stats indi_result_country_collab; +drop table tmp purge; + +ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS; ---- Sprint 4 ---- -create table indi_pub_diamond stored as parquet as +create table if not exists indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd left outer join ( select pd.id, 1 as in_diamond_journal from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; -compute stats indi_pub_diamond; +ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS; ---create table indi_pub_hybrid stored as parquet as ---select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid ---from publication_datasources pd --- left outer join ( --- select pd.id, 1 as is_hybrid from publication_datasources pd --- join datasource d on d.id=pd.datasource --- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) --- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp --- on pd.id=tmp.id; --- ---compute stats indi_pub_hybrid; - -create table indi_pub_in_transformative stored as parquet as +create table if not exists indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative from publication pd left outer join ( select pd.id, 1 as is_transformative from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and 
ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; -compute stats indi_pub_in_transformative; +ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS; -create table indi_pub_closed_other_open stored as parquet as +create table if not exists indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri left outer join (select ri.id, 1 as pub_closed_other_open from result_instance ri @@ -248,255 +214,58 @@ select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_op (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; -compute stats indi_pub_closed_other_open; +ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- -create table indi_result_no_of_copies stored as parquet as +create table if not exists indi_result_no_of_copies stored as parquet as select id, count(id) as number_of_copies from result_instance group by id; -compute stats indi_result_no_of_copies; +ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- ---create table indi_pub_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION ALL --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * FROM --- (SELECT id, issn_printed as issn --- FROM datasource WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource WHERE issn_online IS NOT NULL) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM publication_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM publication_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = 
gold_oa.issn) tmp ON pd.id=tmp.id; - ---compute stats indi_pub_gold_oa; --- ---create table indi_datasets_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM dataset_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_datasets_gold_oa; - ---create table indi_software_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM software_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM software_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_software_gold_oa; - ---create table indi_org_findable stored as parquet as ---with 
result_with_pid as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro --- join result_pids rp on rp.id=ro.id --- group by ro.organization), ---result_has_abstract as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro --- join result rp on rp.id=ro.id where rp.abstract=true --- group by ro.organization), ---allresults as ( --- select organization, count(distinct id) no_allresults from result_organization --- group by organization), ---result_with_pid_share as ( --- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share --- from allresults --- join result_with_pid on result_with_pid.organization=allresults.organization), ---result_with_abstract_share as ( --- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share --- from allresults --- join result_has_abstract on result_has_abstract.organization=allresults.organization) ---select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable ---from allresults ---join result_with_pid_share on result_with_pid_share.organization=allresults.organization ---left outer join ( --- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization; --- ---compute stats indi_org_findable; --- ---create table indi_org_openess stored as parquet as ---WITH datasets_oa as ( --- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join dataset ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---software_oa as ( --- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join software ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by 
ro.organization), ---pubs_oa as ( --- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join publication ds on dg.id=ds.id --- where dg.is_gold=1 --- group by ro.organization), ---allpubs as ( --- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro --- join publication ps on ps.id=ro.id --- group by ro.organization), ---alldatasets as ( --- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro --- join dataset ps on ps.id=ro.id --- group by ro.organization), ---allsoftware as ( --- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro --- join software ps on ps.id=ro.id --- group by ro.organization), ---allpubsshare as ( --- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs --- join pubs_oa on allpubs.organization=pubs_oa.organization), ---alldatasetssshare as ( --- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c --- from alldatasets --- join datasets_oa on alldatasets.organization=datasets_oa.organization), ---allsoftwaresshare as ( --- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s --- from allsoftware --- join software_oa on allsoftware.organization=software_oa.organization) ---select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess ---FROM allpubsshare ---left outer join ( --- select organization,c from --- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization ---left outer join ( --- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization; --- ---compute stats indi_org_openess; --- -create table indi_pub_hybrid_oa_with_cc stored as parquet as - WITH hybrid_oa AS ( - SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn - FROM stats_ext.plan_s_jn - WHERE issn_print != "" - UNION ALL 
- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn - FROM stats_ext.plan_s_jn - WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), - issn AS ( - SELECT * - FROM ( - SELECT id, issn_printed as issn - FROM datasource - WHERE issn_printed IS NOT NULL - UNION - SELECT id,issn_online as issn - FROM datasource - WHERE issn_online IS NOT NULL ) as issn - WHERE LENGTH(issn) > 7) -SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa -FROM publication_datasources pd - LEFT OUTER JOIN ( - SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd - JOIN datasource d on d.id=pd.datasource - JOIN issn on issn.id=pd.datasource - JOIN hybrid_oa ON issn.issn = hybrid_oa.issn - JOIN indi_result_has_cc_licence cc on pd.id=cc.id - where cc.has_cc_license=1) tmp on pd.id=tmp.id; - -compute stats indi_pub_hybrid_oa_with_cc; - -create table indi_pub_downloads stored as parquet as +create table if not exists indi_pub_downloads stored as parquet as SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id order by no_downloads desc; -compute stats indi_pub_downloads; +ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS; -create table indi_pub_downloads_datasource stored as parquet as +create table if not exists indi_pub_downloads_datasource stored as parquet as SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id order by result_id; -compute stats indi_pub_downloads_datasource; +ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS; -create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id where downloads>0 -GROUP 
BY result_id, `year` -order by `year` asc; +create table if not exists indi_pub_downloads_year stored as parquet as +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads +from openaire_prod_usage_stats.usage_stats us +join publication on result_id=id where downloads>0 +GROUP BY result_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_year; +ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS; -create table indi_pub_downloads_datasource_year stored as parquet as +create table if not exists indi_pub_downloads_datasource_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id +join publication on result_id=id where downloads>0 -GROUP BY result_id, repository_id, `year` -order by `year` asc, result_id; +GROUP BY result_id, repository_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_datasource_year; +ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- -create table indi_pub_gold_oa stored as parquet as +create table if not exists indi_pub_gold_oa stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -506,7 +275,7 @@ create table indi_pub_gold_oa stored as parquet as journal_is_oa, issn_2 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * @@ -518,7 +287,7 @@ create table indi_pub_gold_oa stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -538,9 +307,42 @@ FROM JOIN gold_oa on issn.issn = gold_oa.issn) tmp on pd.id=tmp.id; -compute stats indi_pub_gold_oa; +ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS; -create table indi_pub_hybrid stored as parquet as +create table if not 
exists indi_pub_hybrid_oa_with_cc stored as parquet as + WITH hybrid_oa AS ( + SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn + FROM STATS_EXT.plan_s_jn + WHERE issn_print != "" + UNION ALL + SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn + FROM STATS_EXT.plan_s_jn + WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), + issn AS ( + SELECT * + FROM ( + SELECT id, issn_printed as issn + FROM datasource + WHERE issn_printed IS NOT NULL + UNION ALL + SELECT id,issn_online as issn + FROM datasource + WHERE issn_online IS NOT NULL ) as issn + WHERE LENGTH(issn) > 7) +SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa +FROM publication_datasources pd + LEFT OUTER JOIN ( + SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd + JOIN datasource d on d.id=pd.datasource + JOIN issn on issn.id=pd.datasource + JOIN hybrid_oa ON issn.issn = hybrid_oa.issn + JOIN indi_result_has_cc_licence cc on pd.id=cc.id + JOIN indi_pub_gold_oa ga on pd.id=ga.id + where cc.has_cc_license=1 and ga.is_gold=0) tmp on pd.id=tmp.id; + +ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; + +create table if not exists indi_pub_hybrid stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -548,7 +350,7 @@ create table indi_pub_hybrid stored as parquet as issn_1 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -559,7 +361,7 @@ create table indi_pub_hybrid stored as parquet as issn_2 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * @@ -571,7 +373,7 @@ create table indi_pub_hybrid stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -591,15 +393,15 @@ from publication_datasources pd where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp on pd.id=tmp.id; -compute stats 
indi_pub_hybrid; +ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS; -create table indi_org_fairness stored as parquet as +create table if not exists indi_org_fairness stored as parquet as --return results with PIDs, and rich metadata group by organization with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id --join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization), --return all results group by organization allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -611,16 +413,16 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness; +ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS; -create table indi_org_fairness_pub_pr stored as parquet as +create table if not exists indi_org_fairness_pub_pr stored as parquet as with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join publication p on p.id=ro.id join indi_pub_doi_from_crossref dc on dc.id=p.id join indi_pub_grey_lit gl on gl.id=p.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 group by ro.organization), allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -632,150 +434,180 @@ 
select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub_pr; +ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS; -create table indi_org_fairness_pub_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id +CREATE TEMPORARY table result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id where cast(year as int)>2003 - group by organization, year) + group by organization, year; + +create table if not exists indi_org_fairness_pub_year stored as parquet as select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -compute stats indi_org_fairness_pub_year; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_pub as -with result_fair as - (select ro.organization organization, count(distinct ro.id) no_result_fair - from 
result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) - and (authors>0) and cast(year as int)>2003 - group by ro.organization), - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id - where cast(year as int)>2003 - group by organization) +ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select ro.organization organization, count(distinct ro.id) no_result_fair + from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) + and (authors>0) and cast(year as int)>2003 + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as + select organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id + where cast(year as int)>2003 + group by organization; + +create table if not exists indi_org_fairness_pub as select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization; +from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro +ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and 
(abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int)>2003 - group by organization, year) ---return results_fair/all_results -select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + group by organization, year; -compute stats indi_org_fairness_year; +create table if not exists indi_org_fairness_year stored as parquet as + select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + from allresults + join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -create table indi_org_findable_year stored as parquet as ---return results with PIDs group by organization,year - with result_with_pid as - (select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +DROP table result_fair purge; +DROP table allresults purge; + +ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as + select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization, year), ---return all results group by organization,year - 
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization, year) ---return results_with_pid/all_results + group by organization, year; + +create table if not exists indi_org_findable_year stored as parquet as select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; -compute stats indi_org_findable_year; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_findable stored as parquet as ---return results with PIDs group by organization - with result_with_pid as - (select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as +select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization), ---return all results group by organization - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as +select organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization) ---return results_with_pid/all_results + group by organization; + +create table if not exists indi_org_findable stored as parquet as select allresults.organization, 
result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization; -compute stats indi_org_findable; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_openess stored as parquet as - WITH pubs_oa as ( - SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_findable COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa as +SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - datasets_oa as ( - SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization; + +CREATE TEMPORARY TABLE datasets_oa as +SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - software_oa as ( - SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization; + +CREATE TEMPORARY TABLE software_oa as +SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - allpubs as ( - SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allpubs as +SELECT ro.organization organization, 
count(ro.id) no_allpubs FROM result_organization ro join publication ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - alldatasets as ( - SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE alldatasets as +SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allsoftware as ( - SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allsoftware as +SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allpubsshare as ( - select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization), - alldatasetssshare as ( - select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization; + +CREATE TEMPORARY TABLE allpubsshare as +select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization; + +CREATE TEMPORARY TABLE alldatasetssshare as +select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization), - allsoftwaresshare as ( - select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization; + +CREATE TEMPORARY TABLE allsoftwaresshare as +select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on 
allsoftware.organization=software_oa.organization) + join software_oa on allsoftware.organization=software_oa.organization; + +create table if not exists indi_org_openess stored as parquet as select allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select organization,d from @@ -785,55 +617,75 @@ select allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization; -compute stats indi_org_openess; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_org_openess_year stored as parquet as - WITH pubs_oa as ( - SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_openess COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa AS +SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization,r.year), - datasets_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization,r.year; + +CREATE TEMPORARY TABLE datasets_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - 
group by ro.organization, r.year), - software_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE software_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - allpubs as ( - SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE allpubs as +SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication p on p.id=ro.id where cast(p.year as int)>2003 - group by ro.organization, p.year), - alldatasets as ( - SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization, p.year; + +CREATE TEMPORARY TABLE alldatasets as +SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset d on d.id=ro.id where cast(d.year as int)>2003 - group by ro.organization, d.year), - allsoftware as ( - SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization, d.year; + +CREATE TEMPORARY TABLE allsoftware as +SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software s on s.id=ro.id where cast(s.year as int)>2003 - group by ro.organization, s.year), - allpubsshare as ( - select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)), - alldatasetssshare as ( - select 
alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization, s.year; + +CREATE TEMPORARY TABLE allpubsshare as +select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int); + +CREATE TEMPORARY TABLE alldatasetssshare as +select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)), - allsoftwaresshare as ( - select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int); + +CREATE TEMPORARY TABLE allsoftwaresshare as +select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int)) + join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int); + + +create table if not exists indi_org_openess_year stored as parquet as select allpubsshare.year, allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select year, organization,d from @@ -843,9 +695,19 @@ select allpubsshare.year, allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization and 
tmp2.year=allpubsshare.year; -compute stats indi_org_openess_year; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_pub_has_preprint stored as parquet as +ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS; + +create table if not exists indi_pub_has_preprint stored as parquet as select distinct p.id, coalesce(has_preprint, 0) as has_preprint from publication_classifications p left outer join ( @@ -854,9 +716,9 @@ from publication_classifications p where p.type='Preprint') tmp on p.id= tmp.id; -compute stats indi_pub_has_preprint; +ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS; -create table indi_pub_in_subscribed stored as parquet as +create table if not exists indi_pub_in_subscribed stored as parquet as select distinct p.id, coalesce(is_subscription, 0) as is_subscription from publication p left outer join( @@ -867,9 +729,9 @@ from publication p where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp on p.id=tmp.id; -compute stats indi_pub_in_subscribed; +ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS; -create table indi_result_with_pid as +create table if not exists indi_result_with_pid as select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid from result p left outer join ( @@ -877,4 +739,63 @@ from result p from result_pids p) tmp on p.id= tmp.id; -compute stats indi_result_with_pid; \ No newline at end of file +ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pub_fos_totals as +select rf.id, count(distinct lvl3) totals from result_fos rf +group by rf.id; + +create table if not exists indi_pub_interdisciplinarity as +select distinct p.id as id, coalesce(is_interdisciplinary, 0) +as is_interdisciplinary +from pub_fos_totals p +left 
outer join ( +select pub_fos_totals.id, 1 as is_interdisciplinary from pub_fos_totals +where totals>1) tmp on p.id=tmp.id; + +drop table pub_fos_totals purge; + +ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS; + +create table if not exists indi_pub_bronze_oa stored as parquet as +select distinct p.id, coalesce(is_bronze_oa,0) as is_bronze_oa +from publication p +left outer join +(select p.id, 1 as is_bronze_oa from publication p +join indi_result_has_cc_licence cc on cc.id=p.id +join indi_pub_gold_oa ga on ga.id=p.id +where cc.has_cc_license=0 and ga.is_gold=0) tmp on tmp.id=p.id; + +-- create table if not exists indi_pub_bronze_oa stored as parquet as +-- WITH hybrid_oa AS ( +-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn +-- FROM STATS_EXT.plan_s_jn +-- WHERE issn_print != "" +-- UNION ALL +-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn +-- FROM STATS_EXT.plan_s_jn +-- WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), +-- issn AS ( +-- SELECT * +-- FROM ( +-- SELECT id, issn_printed as issn +-- FROM datasource +-- WHERE issn_printed IS NOT NULL +-- UNION ALL +-- SELECT id,issn_online as issn +-- FROM datasource +-- WHERE issn_online IS NOT NULL ) as issn +-- WHERE LENGTH(issn) > 7) +--SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_bronze_oa +--FROM publication_datasources pd +-- LEFT OUTER JOIN ( +-- SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd +-- JOIN datasource d on d.id=pd.datasource +-- JOIN issn on issn.id=pd.datasource +-- JOIN hybrid_oa ON issn.issn = hybrid_oa.issn +-- JOIN indi_result_has_cc_licence cc on pd.id=cc.id +-- JOIN indi_pub_gold_oa ga on pd.id=ga.id +-- JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id +-- where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id; + +ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS; \ No newline at end of file diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 98dca7129..9744d5aae 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -1,5 +1,78 @@ -drop database if exists TARGET cascade; -create database if not exists TARGET; +--drop database if exists TARGET cascade; +--create database if not exists TARGET; +-- +--create view if not exists TARGET.category as select * from SOURCE.category; +--create view if not exists TARGET.concept as select * from SOURCE.concept; +--create view if not exists TARGET.context as select * from SOURCE.context; +--create view if not exists TARGET.country as select * from SOURCE.country; +--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; +--create view if not exists TARGET.funder as select * from SOURCE.funder; +--create view if not exists TARGET.fundref as select * from SOURCE.fundref; +--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; +-- +--create table TARGET.result stored as parquet as +-- select distinct * 
from ( +-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) +-- union all +-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) +-- union all +-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( +-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" +-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council +-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? +-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University +-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade +-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki +-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho +-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid +-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen +-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens +-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot +-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University +-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark +-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin +-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt +-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven +-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape +-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute +-- 
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University +-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg +-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) +-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr +-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw +-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly +-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete +-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus +-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras +-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki +-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank +-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech +-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University +-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona +-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University +-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia +-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University +-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje +-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan +-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork +-- 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University +-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech +-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town +-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- 
Technological University Dublin +-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology +-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba +-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili +-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University +-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique +-- ) )) foo; +-- +--ANALYZE TABLE TARGET.result COMPUTE STATISTICS; create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; @@ -10,128 +83,90 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_ create view if not exists TARGET.funder as select * from SOURCE.funder; create view if not exists TARGET.fundref as select * from SOURCE.fundref; create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; - -create table TARGET.result stored as parquet as - select distinct * from ( - select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) - union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) - union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( - 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" - 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council - 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? 
- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University - 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade - 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki - 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho - 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid - 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen - 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens - -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot - 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University - 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark - 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin - 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt - 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven - 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape - 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute - 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University - 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg - 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) - 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr - 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw - 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly - 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete - 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus - 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras - 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle 
University of Thessaloniki - 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank - 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech - 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University - 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona - 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University - 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia - 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University - 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje - 'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan - ) )) foo; -compute stats TARGET.result; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations; +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_references_oc; +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats 
TARGET.result_citations_oc; +ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_classifications; +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_apc; +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_concepts; +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_datasources; +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fundercount; +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_gold; +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_greenoa; +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * 
from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_languages; +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_licenses; +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; +ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_oids; +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_organization; +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_peerreviewed; +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_pids; +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projectcount; +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet 
as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projects; +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_refereed; +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_sources; +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_topics; +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fos; +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; + +create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -compute stats TARGET.result_result; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create 
view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -140,7 +175,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; -compute stats TARGET.datasource_results; +ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists TARGET.organization as select * from SOURCE.organization; @@ -155,30 +190,31 @@ create view if not exists TARGET.project_oids as select * from SOURCE.project_oi create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; +create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; -compute stats TARGET.project_results; +ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select 
* from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence; +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence_url; +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_has_abstract; +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_with_orcid; +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_funded_result_with_fundref; +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view 
TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -187,30 +223,32 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_diamond; +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_in_transformative; +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_closed_other_open; +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_no_of_copies; +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE 
TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads; +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_year; +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource_year; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid; +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from 
SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -221,11 +259,12 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); - ---create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_datasets_gold_oa; ---create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_software_gold_oa; - +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; +create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql new file mode 100644 index 000000000..92b40405d --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql @@ -0,0 +1,15 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create table TARGET.result stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists + (select 1 + from SOURCE.result_concepts rc + join SOURCE.concept conc on conc.id=rc.concept + join SOURCE.category cat on cat.id=conc.category + join SOURCE.context cont on cont.id=cat.context +-- join SOURCE.result + where rc.id=r.id and conc.category like CONTEXT) +) foo; +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql new file mode 100644 index 000000000..ef6d08d79 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql @@ -0,0 +1,15 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create table TARGET.result stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists + (select 1 + from SOURCE.result_concepts rc + join SOURCE.concept conc on conc.id=rc.concept + join SOURCE.category cat on cat.id=conc.category + join 
SOURCE.context cont on cont.id=cat.context +-- join SOURCE.result + where rc.id=r.id and conc.category not in (CONTEXTS)) +) foo; +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql new file mode 100644 index 000000000..8d8739c74 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql @@ -0,0 +1,9 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create table TARGET.result stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) + ) foo; + +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql new file mode 100644 index 000000000..442e623cd --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -0,0 +1,58 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create table TARGET.result stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( + 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In 
Information Communication & Knowledge Technologies', --ARC" + 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council + 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? + 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University + 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade + 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki + 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho + 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid + 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen + 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens + -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot + 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University + 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark + 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin + 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt + 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven + 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape + 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute + 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University + 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg + 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) + 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr + 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw + 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly + 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete +
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus + 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras + 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki + 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank + 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech + 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University + 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona + 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University + 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia + 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University + 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan + 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town + 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin + 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology + 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba + 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili + 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University + 'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique + 'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology + 'openorgs____::1624ff7c01bb641b91f4518539a0c28a' -- Vrije Universiteit Amsterdam + ))) foo; + +ANALYZE TABLE
TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index e24370e7d..2d7d572b3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -8,6 +8,8 @@ from ${stats_db_name}.result r group by rl.id ) rln on rln.id=r.id; +ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_country stored as parquet as select count(distinct r.id) as total, @@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year stored as parquet as select count(distinct r.id) as total, @@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, @@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else 
false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, @@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, @@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, @@ -215,6 +229,8 @@ group by 
r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, @@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, @@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, @@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as 
total, @@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, @@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, @@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization stored as parquet 
as select count(distinct r.id) as total, @@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, @@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, @@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, @@ -558,4 +594,6 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count 
> 1, p.funder, c.code, c.name; \ No newline at end of file + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index c31180c14..e0522e149 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -84,4 +84,12 @@ create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; \ No newline at end of file +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; + +CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS +SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +properties[0].value contribution, properties[1].value currency +from ${openaire_db_name}.relation r +LATERAL VIEW explode (r.properties) properties +where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' +and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 01bed17cc..248716b36 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual -( - dummy CHAR(1) -); -INSERT INTO ${stats_db_name}.dual -VALUES ('X'); +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); + +INSERT INTO ${stats_db_name}.dual VALUES ('X'); + INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) SELECT 'other', @@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; -UPDATE ${stats_db_name}.datasource_tmp -SET name='Other' -WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp -SET yearofvalidation=null -WHERE yearofvalidation = '-1'; +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language @@ -104,4 +98,4 @@ where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource 
AS id, id AS result -FROM ${stats_db_name}.result_datasources; \ No newline at end of file +FROM ${stats_db_name}.result_datasources; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 08d33f4e8..2ab50fb29 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + stats_db_name @@ -10,9 +10,12 @@ external_stats_db_name - stats_ext the external stats that should be added since they are not included in the graph database + + usage_stats_db_name + the usage statistics database name + stats_db_shadow_name the name of the shadow schema @@ -33,6 +36,10 @@ observatory_db_shadow_name the name of the shadow monitor db + + usage_stats_db_shadow_name + the name of the shadow usage stats db + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache update. 
@@ -53,6 +60,10 @@ context_api_url the base url of the context api (https://services.openaire.eu/openaire) + + hadoop_user_name + user name of the wf owner + @@ -67,10 +78,47 @@ hive.txn.timeout ${hive_timeout} + + mapred.job.queue.name + analytics + - + + + + ${wf:conf('resumeFrom') eq 'Step1'} + ${wf:conf('resumeFrom') eq 'Step2'} + ${wf:conf('resumeFrom') eq 'Step3'} + ${wf:conf('resumeFrom') eq 'Step4'} + ${wf:conf('resumeFrom') eq 'Step5'} + ${wf:conf('resumeFrom') eq 'Step6'} + ${wf:conf('resumeFrom') eq 'Step7'} + ${wf:conf('resumeFrom') eq 'Step8'} + ${wf:conf('resumeFrom') eq 'Step9'} + ${wf:conf('resumeFrom') eq 'Step10'} + ${wf:conf('resumeFrom') eq 'Step11'} + ${wf:conf('resumeFrom') eq 'Step12'} + ${wf:conf('resumeFrom') eq 'Step13'} + ${wf:conf('resumeFrom') eq 'Step14'} + ${wf:conf('resumeFrom') eq 'Step15'} + ${wf:conf('resumeFrom') eq 'Step15_5'} + ${wf:conf('resumeFrom') eq 'Contexts'} + ${wf:conf('resumeFrom') eq 'Step16-createIndicatorsTables'} + ${wf:conf('resumeFrom') eq 'Step16_1-definitions'} + ${wf:conf('resumeFrom') eq 'Step16_5'} + ${wf:conf('resumeFrom') eq 'Step19-finalize'} + ${wf:conf('resumeFrom') eq 'step20-createMonitorDB'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} + ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} + ${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step24-updateCache'} + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -249,6 +297,7 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + external_stats_db_name=${external_stats_db_name} @@ -273,6 +322,7 @@ ${nameNode} indicators.sh ${stats_db_name} + ${external_stats_db_name} ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh @@ -301,7 +351,7 @@ - + ${jobTracker} @@ -324,12 +374,29 @@ ${monitor_db_name} 
${monitor_db_shadow_name} ${wf:appPath()}/scripts/step20-createMonitorDB.sql + ${wf:appPath()}/scripts/step20-createMonitorDB_funded.sql + ${wf:appPath()}/scripts/step20-createMonitorDB_institutions.sql + ${wf:appPath()}/scripts/step20-createMonitorDB_RIs.sql + ${wf:appPath()}/scripts/step20-createMonitorDB_RIs_tail.sql monitor.sh + + + + + + + + + + + + + ${jobTracker} @@ -360,16 +427,53 @@ ${jobTracker} ${nameNode} observatory-post.sh - ${stats_db_name} ${observatory_db_name} ${observatory_db_shadow_name} observatory-post.sh - + - + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + + + ${stats_db_name} + ${monitor_db_name} + ${observatory_db_name} + ${external_stats_db_name} + ${usage_stats_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${stats_db_name} + ${stats_db_shadow_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + ${usage_stats_db_name} + ${usage_stats_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + ${jobTracker} ${nameNode} @@ -382,4 +486,4 @@ - \ No newline at end of file +