From 163b2ee2a8f27755a36de97c2f6115d27b367165 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 13 Jul 2023 15:25:00 +0300 Subject: [PATCH] Changes 1. Monitor updates 2. Bug fixes during copy to impala cluster --- .../oozie_app/config-default.xml | 30 + .../oozie_app/copyDataToImpalaCluster.sh | 75 ++ .../oozie_app/finalizeImpalaCluster.sh | 29 + .../graph/stats-monitor/oozie_app/monitor.sh | 54 ++ .../oozie_app/scripts/updateMonitorDB.sql | 138 ++++ .../oozie_app/scripts/updateMonitorDBAll.sql | 150 ++++ .../scripts/updateMonitorDB_institutions.sql | 12 + .../stats-monitor/oozie_app/workflow.xml | 110 +++ .../oozie_app/copyDataToImpalaCluster.sh | 8 +- .../stats/oozie_app/finalizeImpalaCluster.sh | 10 +- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 22 +- .../graph/stats/oozie_app/scripts/step15.sql | 11 +- .../scripts/step16-createIndicatorsTables.sql | 718 +++++++++--------- .../scripts/step20-createMonitorDB.sql | 106 +-- .../scripts/step20-createMonitorDBAll.sql | 276 +++++++ .../scripts/step20-createMonitorDB_RIs.sql | 2 +- .../step20-createMonitorDB_RIs_tail.sql | 2 +- .../scripts/step20-createMonitorDB_funded.sql | 2 +- .../step20-createMonitorDB_institutions.sql | 9 +- .../scripts/step21-createObservatoryDB.sql | 38 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 18 +- 21 files changed, 1347 insertions(+), 473 deletions(-) create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/finalizeImpalaCluster.sh create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/monitor.sh create mode 100644 
dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql create mode 100644 dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/config-default.xml new file mode 100644 index 000000000..b2a1322e6 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh 
b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..1587f7152 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,75 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +#export HADOOP_USER_NAME=$2 + +function copydb() { + + export HADOOP_USER="dimitris.pierrakos" + export HADOOP_USER_NAME='dimitris.pierrakos' + + db=$1 + FILE=("hive_wf_tmp_"$RANDOM) + hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ + + # change ownership to impala +# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + + + # copy the databases from ocean to impala + echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ + + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + + # drop tables from db (run the statement directly; do not execute its output) + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"; + done + + # drop views from db (run the statement directly; do not execute its output) + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"; + done + + # delete the database + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "create 
database ${db}"; + + impala-shell -q "INVALIDATE METADATA" + echo "creating schema for ${db}" + for (( k = 0; k < 5; k ++ )); do + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + done + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the whole temporary staging directory from hdfs (:? aborts if FILE is ever empty) +hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/${FILE:?}/ +} + +MONITOR_DB=$1 +#HADOOP_USER_NAME=$2 + +copydb $MONITOR_DB'_institutions' +copydb $MONITOR_DB + diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..a7227e0c8 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,29 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function createShadowDB() { + SOURCE=$1 + SHADOW=$2 + + # drop views from db (run the statement directly; do not execute its output) + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "drop view $i;"; + done + + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${SHADOW} CASCADE"; + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}"; +# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - + impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f - +} + +MONITOR_DB=$1 +MONITOR_DB_SHADOW=$2 + +createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow' +createShadowDB $MONITOR_DB $MONITOR_DB'_shadow' diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/monitor.sh new file mode 100644 index 000000000..4f1889c9e --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/monitor.sh @@ -0,0 +1,54 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 +export SCRIPT_PATH2=$5 +export SCRIPT_PATH3=$6 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Getting file from " $5 +hdfs dfs -copyToLocal $5 + +echo "Getting file from " $6 +hdfs dfs -copyToLocal $6 + +#update Institutions DB +cat updateMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo +hive $HIVE_OPTS -f foo +cat updateMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo +hive $HIVE_OPTS -f foo + +echo "Hive shell finished" + +echo "Updating shadow monitor institutions database" +hive -e "drop database if exists ${SHADOW}_institutions cascade" +hive -e "create database if not exists ${SHADOW}_institutions" +hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo +hive -f foo +echo "Shadow db monitor institutions ready!" 
+ +#update Monitor DB +cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo +hive $HIVE_OPTS -f foo + +echo "Hive shell finished" + +echo "Updating shadow monitor database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${2} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${2}.\1;/" > foo +hive -f foo +echo "Shadow db monitor ready!" diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql new file mode 100644 index 000000000..248b7e564 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql @@ -0,0 +1,138 @@ +INSERT INTO TARGET.result select * from TARGET.result_new; +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; + +INSERT INTO TARGET.result_citations select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; + +INSERT INTO TARGET.result_references_oc select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; + +INSERT INTO TARGET.result_classifications select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; + +INSERT INTO TARGET.result_apc select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; + +INSERT INTO 
TARGET.result_concepts select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; + +INSERT INTO TARGET.result_datasources select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; + +INSERT INTO TARGET.result_fundercount select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; + +INSERT INTO TARGET.result_gold select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; + +INSERT INTO TARGET.result_greenoa select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; + +INSERT INTO TARGET.result_languages select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; + +INSERT INTO TARGET.result_licenses select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; + +INSERT INTO TARGET.result_oids select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; + +INSERT INTO TARGET.result_organization select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; + +INSERT INTO TARGET.result_peerreviewed select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result_new r where 
r.id=orig.id); +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; + +INSERT INTO TARGET.result_pids select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; + +INSERT INTO TARGET.result_projectcount select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; + +INSERT INTO TARGET.result_projects select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; + +INSERT INTO TARGET.result_refereed select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; + +INSERT INTO TARGET.result_sources select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; + +INSERT INTO TARGET.result_topics select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; + +INSERT INTO TARGET.result_fos select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; + +INSERT INTO TARGET.result_accessroute select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; + +create or replace view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result_new); +create or replace view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result_new); +insert into 
TARGET.result_result select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +drop view TARGET.foo1; +drop view TARGET.foo2; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; + + +-- indicators +-- Sprint 1 ---- +INSERT INTO TARGET.indi_pub_green_oa select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_grey_lit select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_doi_from_crossref select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +-- Sprint 2 ---- +INSERT INTO TARGET.indi_result_has_cc_licence select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_has_cc_licence_url select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_has_abstract select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_with_orcid select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +---- Sprint 3 ---- +INSERT INTO TARGET.indi_funded_result_with_fundref select * from SOURCE.indi_funded_result_with_fundref 
orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; + +---- Sprint 4 ---- +INSERT INTO TARGET.indi_pub_diamond select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_in_transformative select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_closed_other_open select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +---- Sprint 5 ---- +INSERT INTO TARGET.indi_result_no_of_copies select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +---- Sprint 6 ---- +INSERT INTO TARGET.indi_pub_hybrid_oa_with_cc select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_bronze_oa select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_datasource select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE 
TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_year select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_datasource_year select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +---- Sprint 7 ---- +INSERT INTO TARGET.indi_pub_gold_oa select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_hybrid select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_has_preprint select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_in_subscribed select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_with_pid select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +INSERT INTO TARGET.indi_impact_measures select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_interdisciplinarity select * from SOURCE.indi_pub_interdisciplinarity orig where 
exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; + +DROP TABLE IF EXISTS TARGET.result_new; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql new file mode 100644 index 000000000..478e3824e --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql @@ -0,0 +1,150 @@ +DROP TABLE IF EXISTS TARGET.result_new; + +create table TARGET.result_new as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( + 'openorgs____::4d4051b56708688235252f1d8fddb8c1', --Iscte - Instituto Universitário de Lisboa + 'openorgs____::ab4ac74c35fa5dada770cf08e5110fab' -- Universidade Católica Portuguesa + ) )) foo; + +-- TARGET.result is loaded from result_new exactly once, by the INSERT after this ANALYZE (a second INSERT here duplicated every new row) +ANALYZE TABLE TARGET.result_new COMPUTE STATISTICS; + +INSERT INTO TARGET.result select * from TARGET.result_new; +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; + +INSERT INTO TARGET.result_citations select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; + +INSERT INTO TARGET.result_references_oc select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; + +INSERT INTO TARGET.result_classifications select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE 
TARGET.result_classifications COMPUTE STATISTICS; + +INSERT INTO TARGET.result_apc select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; + +INSERT INTO TARGET.result_concepts select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; + +INSERT INTO TARGET.result_datasources select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; + +INSERT INTO TARGET.result_fundercount select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; + +INSERT INTO TARGET.result_gold select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; + +INSERT INTO TARGET.result_greenoa select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; + +INSERT INTO TARGET.result_languages select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; + +INSERT INTO TARGET.result_licenses select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; + +INSERT INTO TARGET.result_oids select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; + +INSERT INTO TARGET.result_organization select * from SOURCE.result_organization orig where exists 
(select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; + +INSERT INTO TARGET.result_peerreviewed select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; + +INSERT INTO TARGET.result_pids select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; + +INSERT INTO TARGET.result_projectcount select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; + +INSERT INTO TARGET.result_projects select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; + +INSERT INTO TARGET.result_refereed select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; + +INSERT INTO TARGET.result_sources select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; + +INSERT INTO TARGET.result_topics select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; + +INSERT INTO TARGET.result_fos select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; + +INSERT INTO TARGET.result_accessroute select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; + +create or replace 
view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result_new); +create or replace view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result_new); +insert into TARGET.result_result select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +drop view TARGET.foo1; +drop view TARGET.foo2; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; + + +-- indicators +-- Sprint 1 ---- +INSERT INTO TARGET.indi_pub_green_oa select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_grey_lit select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_doi_from_crossref select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +-- Sprint 2 ---- +INSERT INTO TARGET.indi_result_has_cc_licence select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_has_cc_licence_url select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_has_abstract select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_with_orcid select * from SOURCE.indi_result_with_orcid orig 
where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +---- Sprint 3 ---- +INSERT INTO TARGET.indi_funded_result_with_fundref select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; + +---- Sprint 4 ---- +INSERT INTO TARGET.indi_pub_diamond select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_in_transformative select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_closed_other_open select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +---- Sprint 5 ---- +INSERT INTO TARGET.indi_result_no_of_copies select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +---- Sprint 6 ---- +INSERT INTO TARGET.indi_pub_hybrid_oa_with_cc select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_bronze_oa select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); 
+ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_datasource select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_year select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_downloads_datasource_year select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result_new r where r.id=orig.result_id); +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +---- Sprint 7 ---- +INSERT INTO TARGET.indi_pub_gold_oa select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_hybrid select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_has_preprint select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_in_subscribed select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; +INSERT INTO TARGET.indi_result_with_pid select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +INSERT INTO TARGET.indi_impact_measures select * from 
SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; +INSERT INTO TARGET.indi_pub_interdisciplinarity select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result_new r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; + +DROP TABLE IF EXISTS TARGET.result_new; diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql new file mode 100644 index 000000000..236f3733f --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS TARGET.result_new; + +create table TARGET.result_new as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( + 'openorgs____::4d4051b56708688235252f1d8fddb8c1', --Iscte - Instituto Universitário de Lisboa + 'openorgs____::ab4ac74c35fa5dada770cf08e5110fab' -- Universidade Católica Portuguesa + ) )) foo; + +INSERT INTO TARGET.result select * from TARGET.result_new; +ANALYZE TABLE TARGET.result_new COMPUTE STATISTICS; + diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/workflow.xml new file mode 100644 index 000000000..7b999a843 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/workflow.xml @@ -0,0 +1,110 @@ 
+ + + + stats_db_name + the target stats database name + + + monitor_db_name + the target monitor db name + + + monitor_db_shadow_name + the name of the shadow monitor db + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds. + + + hadoop_user_name + user name of the wf owner + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + mapred.job.queue.name + analytics + + + + + + + + ${wf:conf('resumeFrom') eq 'Step1-updateMonitorDB'} + ${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + monitor.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${wf:appPath()}/scripts/updateMonitorDB_institutions.sql + ${wf:appPath()}/scripts/updateMonitorDB.sql + ${wf:appPath()}/scripts/updateMonitorDBAll.sql + monitor.sh + + + + + + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + ${monitor_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${monitor_db_name} + ${monitor_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 87294f6e9..431978997 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -24,13 +24,13 @@ function copydb() { # drop tables from db for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; do - `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop table $i;"`; + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; done # drop views from db for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; do - `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop view $i;"`; + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; done # delete the database @@ -82,12 +82,12 @@ copydb $USAGE_STATS_DB copydb $PROD_USAGE_STATS_DB copydb $EXT_DB copydb $STATS_DB -#copydb $MONITOR_DB +copydb $MONITOR_DB copydb $OBSERVATORY_DB copydb $MONITOR_DB'_funded' copydb $MONITOR_DB'_institutions' -copydb $MONITOR_DB'_RIs_tail' +copydb $MONITOR_DB'_ris_tail' contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" for i in ${contexts} diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh index 857635b6c..86a93216c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizeImpalaCluster.sh @@ -13,7 +13,7 @@ function createShadowDB() { # drop views from db for i in `impala-shell -i 
impala-cluster-dn1.openaire.eu -d ${SHADOW} --delimited -q "show tables"`; do - `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${SHADOW} -q "drop view $i;"`; + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "drop view $i;"`; done impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE"; @@ -36,13 +36,13 @@ createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW createShadowDB USAGE_STATS_DB USAGE_STATS_DB_SHADOW -createShadowDB $MONITOR_DB'_funded' $MONITOR_DB'_funded_shadow' -createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow' -createShadowDB $MONITOR_DB'_RIs_tail' $MONITOR_DB'_RIs_tail_shadow' +createShadowDB $MONITOR_DB'_funded' $MONITOR_DB_SHADOW'_shadow_funded' +createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB_SHADOW'_shadow_institutions' +createShadowDB $MONITOR_DB'_ris_tail' $MONITOR_DB_SHADOW'_shadow_ris_tail' contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" for i in ${contexts} do tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` - createShadowDB ${MONITOR_DB}'_'${tmp} ${MONITOR_DB}'_'${tmp}'_shadow' + createShadowDB ${MONITOR_DB}'_'${tmp} ${MONITOR_DB_SHADOW}'_shadow_'${tmp} done \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index 08f4c9232..014b19c6c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -14,6 +14,7 @@ export SCRIPT_PATH2=$5 export SCRIPT_PATH3=$6 
export SCRIPT_PATH4=$7 export SCRIPT_PATH5=$8 +export SCRIPT_PATH6=$9 export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" export HADOOP_USER_NAME="oozie" @@ -33,12 +34,19 @@ hdfs dfs -copyToLocal $7 echo "Getting file from " $8 hdfs dfs -copyToLocal $8 +echo "Getting file from " $9 +hdfs dfs -copyToLocal $9 + + echo "Creating monitor database" +cat step20-createMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo +hive $HIVE_OPTS -f foo + cat step20-createMonitorDB_funded.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo hive $HIVE_OPTS -f foo cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo hive $HIVE_OPTS -f foo -# + cat step20-createMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo hive $HIVE_OPTS -f foo cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo @@ -56,14 +64,20 @@ do hive $HIVE_OPTS -f foo done - -cat step20-createMonitorDB_RIs_tail.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" | sed "s/CONTEXTS/\"'knowmad::other','dh-ch::other', 'enermaps::other', 'gotriple::other', 'neanias-atmospheric::other', 'rural-digital-europe::other', 'covid-19::other', 'aurora::other', 'neanias-space::other', 'north-america-studies::other', 'north-american-studies::other', 'eutopia::other'\"/g" > foo +cat step20-createMonitorDB_RIs_tail.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_ris_tail/g1" | sed "s/CONTEXTS/\"'knowmad::other','dh-ch::other', 'enermaps::other', 'gotriple::other', 'neanias-atmospheric::other', 'rural-digital-europe::other',
'covid-19::other', 'aurora::other', 'neanias-space::other', 'north-america-studies::other', 'north-american-studies::other', 'eutopia::other'\"/g" > foo hive $HIVE_OPTS -f foo -cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" > foo +cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_ris_tail/g1" > foo hive $HIVE_OPTS -f foo echo "Hive shell finished" +echo "Updating shadow monitor all database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${2} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${2}.\1;/" > foo +hive -f foo +echo "Updated shadow monitor all database" + echo "Updating shadow monitor funded database" hive -e "drop database if exists ${SHADOW}_funded cascade" hive -e "create database if not exists ${SHADOW}_funded" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 132cb482e..75e8b001b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -37,8 +37,17 @@ select * from ${stats_db_name}.otherresearchproduct_refereed; create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, -cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class +cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids where 
measures_ids.id!='views' and measures_ids.id!='downloads'; ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS; + +create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as +select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, +cast(rel.properties[0].value as double) apc_amount, +rel.properties[1].value apc_currency +from ${openaire_db_name}.relation rel +join ${openaire_db_name}.organization o on o.id=rel.source +join ${openaire_db_name}.result r on r.id=rel.target +where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties) > 0; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 36b34cc3c..57c381875 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,88 +1,88 @@ -- Sprint 1 ---- -create table if not exists indi_pub_green_oa stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa -from publication p +from ${stats_db_name}.publication p left outer join ( select p.id, 1 as green_oa - from publication p - join result_instance ri on ri.id = p.id - join datasource on datasource.id = ri.hostedby + from ${stats_db_name}.publication p + join ${stats_db_name}.result_instance ri on ri.id = p.id + join ${stats_db_name}.datasource on datasource.id = ri.hostedby where datasource.type like '%Repository%' and (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 
'Open Source')) tmp on p.id= tmp.id; -ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_green_oa COMPUTE STATISTICS; -create table if not exists indi_pub_grey_lit stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit -from publication p +from ${stats_db_name}.publication p left outer join ( select p.id, 1 as grey_lit - from publication p - join result_classifications rt on rt.id = p.id + from ${stats_db_name}.publication p + join ${stats_db_name}.result_classifications rt on rt.id = p.id where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and - not exists (select 1 from result_classifications rc where type ='Other literature type' + not exists (select 1 from ${stats_db_name}.result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; -ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_grey_lit COMPUTE STATISTICS; -create table if not exists indi_pub_doi_from_crossref stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref -from publication p +from ${stats_db_name}.publication p left outer join - (select ri.id, 1 as doi_from_crossref from result_instance ri - join datasource d on d.id = ri.collectedfrom + (select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri + join ${stats_db_name}.datasource d on d.id = ri.collectedfrom where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp on tmp.id=p.id; -ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- 
-create table if not exists indi_result_has_cc_licence stored as parquet as +create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license -from result r - left outer join (select r.id, license.type as lic from result r - join result_licenses as license on license.id = r.id +from ${stats_db_name}.result r +left outer join (select r.id, license.type as lic from ${stats_db_name}.result r + join ${stats_db_name}.result_licenses as license on license.id = r.id where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp on r.id= tmp.id; -ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_has_cc_licence COMPUTE STATISTICS; -create table if not exists indi_result_has_cc_licence_url stored as parquet as +create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url -from result r +from ${stats_db_name}.result r left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host - from result r - join result_licenses as license on license.id = r.id + from ${stats_db_name}.result r + join ${stats_db_name}.result_licenses as license on license.id = r.id WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; -ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_has_cc_licence_url COMPUTE STATISTICS; -create table if not exists indi_pub_has_abstract stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract -from publication; +from ${stats_db_name}.publication; -ANALYZE TABLE indi_pub_has_abstract 
COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_has_abstract COMPUTE STATISTICS; -create table if not exists indi_result_with_orcid stored as parquet as +create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid -from result r - left outer join (select id, 1 as has_orcid from result_orcid) tmp +from ${stats_db_name}.result r + left outer join (select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; -ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- -create table if not exists indi_funded_result_with_fundref stored as parquet as +create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as select distinct r.result as id, coalesce(fundref, 0) as fundref -from project_results r - left outer join (select distinct result, 1 as fundref from project_results +from ${stats_db_name}.project_results r + left outer join (select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; -ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_funded_result_with_fundref COMPUTE STATISTICS; -- create table indi_result_org_collab stored as parquet as -- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations @@ -92,68 +92,68 @@ ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS; -- -- compute stats indi_result_org_collab; -- -create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id, o.name from result_organization ro -join organization o on o.id=ro.organization where o.name is not null; +create TEMPORARY TABLE ${stats_db_name}.tmp AS SELECT ro.organization organization, ro.id, o.name from ${stats_db_name}.result_organization ro +join 
${stats_db_name}.organization o on o.id=ro.organization where o.name is not null; -create table if not exists indi_result_org_collab stored as parquet as +create table if not exists ${stats_db_name}.indi_result_org_collab stored as parquet as select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations -from tmp as o1 -join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name +from ${stats_db_name}.tmp as o1 +join ${stats_db_name}.tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name group by o1.organization, o2.organization, o1.name, o2.name; -drop table tmp purge; +drop table ${stats_db_name}.tmp purge; -ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_org_collab COMPUTE STATISTICS; -create TEMPORARY TABLE tmp AS -select distinct ro.organization organization, ro.id, o.name, o.country from result_organization ro -join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; +create TEMPORARY TABLE ${stats_db_name}.tmp AS +select distinct ro.organization organization, ro.id, o.name, o.country from ${stats_db_name}.result_organization ro +join ${stats_db_name}.organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; -create table if not exists indi_result_org_country_collab stored as parquet as +create table if not exists ${stats_db_name}.indi_result_org_country_collab stored as parquet as select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations -from tmp as o1 join tmp as o2 on o1.id=o2.id +from ${stats_db_name}.tmp as o1 join ${stats_db_name}.tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o1.name, o2.country; -drop table tmp purge; +drop table ${stats_db_name}.tmp purge; -ANALYZE TABLE indi_result_org_country_collab COMPUTE 
STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_org_country_collab COMPUTE STATISTICS; -create TEMPORARY TABLE tmp AS -select o.id organization, o.name, ro.project as project from organization o - join organization_projects ro on o.id=ro.id where o.name is not null; +create TEMPORARY TABLE ${stats_db_name}.tmp AS +select o.id organization, o.name, ro.project as project from ${stats_db_name}.organization o + join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null; -create table if not exists indi_project_collab_org stored as parquet as +create table if not exists ${stats_db_name}.indi_project_collab_org stored as parquet as select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations -from tmp as o1 - join tmp as o2 on o1.project=o2.project +from ${stats_db_name}.tmp as o1 + join ${stats_db_name}.tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.name<>o2.name group by o1.name,o2.name, o1.organization, o2.organization; -drop table tmp purge; +drop table ${stats_db_name}.tmp purge; -ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_project_collab_org COMPUTE STATISTICS; -create TEMPORARY TABLE tmp AS -select o.id organization, o.name, o.country , ro.project as project from organization o - join organization_projects ro on o.id=ro.id +create TEMPORARY TABLE ${stats_db_name}.tmp AS +select o.id organization, o.name, o.country , ro.project as project from ${stats_db_name}.organization o + join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null; -create table if not exists indi_project_collab_org_country stored as parquet as +create table if not exists ${stats_db_name}.indi_project_collab_org_country stored as parquet as select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations 
-from tmp as o1 - join tmp as o2 on o1.project=o2.project +from ${stats_db_name}.tmp as o1 + join ${stats_db_name}.tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country group by o1.organization, o2.country, o1.name; -drop table tmp purge; +drop table ${stats_db_name}.tmp purge; -ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_project_collab_org_country COMPUTE STATISTICS; -create table if not exists indi_funder_country_collab stored as parquet as - with tmp as (select funder, project, country from organization_projects op - join organization o on o.id=op.id - join project p on p.id=op.project +create table if not exists ${stats_db_name}.indi_funder_country_collab stored as parquet as + with tmp as (select funder, project, country from ${stats_db_name}.organization_projects op + join ${stats_db_name}.organization o on o.id=op.id + join ${stats_db_name}.project p on p.id=op.project where country <> 'UNKNOWN') select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations from tmp as f1 @@ -161,104 +161,104 @@ from tmp as f1 where f1.country<>f2.country group by f1.funder, f2.country, f1.country; -ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_funder_country_collab COMPUTE STATISTICS; -create TEMPORARY TABLE tmp AS -select distinct country, ro.id as result from organization o - join result_organization ro on o.id=ro.organization +create TEMPORARY TABLE ${stats_db_name}.tmp AS +select distinct country, ro.id as result from ${stats_db_name}.organization o + join ${stats_db_name}.result_organization ro on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; -create table if not exists indi_result_country_collab stored as parquet as +create table if not exists ${stats_db_name}.indi_result_country_collab stored as parquet as select o1.country country1, 
o2.country country2, count(o1.result) as collaborations -from tmp as o1 - join tmp as o2 on o1.result=o2.result +from ${stats_db_name}.tmp as o1 + join ${stats_db_name}.tmp as o2 on o1.result=o2.result where o1.country<>o2.country group by o1.country, o2.country; -drop table tmp purge; +drop table ${stats_db_name}.tmp purge; -ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_country_collab COMPUTE STATISTICS; ---- Sprint 4 ---- -create table if not exists indi_pub_diamond stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal -from publication_datasources pd +from ${stats_db_name}.publication_datasources pd left outer join ( - select pd.id, 1 as in_diamond_journal from publication_datasources pd - join datasource d on d.id=pd.datasource + select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd + join ${stats_db_name}.datasource d on d.id=pd.datasource join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; -ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS; +----ANALYZE TABLE ${stats_db_name}.indi_pub_diamond COMPUTE STATISTICS; -create table if not exists indi_pub_in_transformative stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative -from publication pd +from ${stats_db_name}.publication pd left outer join ( - select pd.id, 1 as is_transformative from publication_datasources pd - join datasource d on d.id=pd.datasource + select pd.id, 1 as is_transformative from ${stats_db_name}.publication_datasources pd + join ${stats_db_name}.datasource d on d.id=pd.datasource join 
STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; -ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_in_transformative COMPUTE STATISTICS; -create table if not exists indi_pub_closed_other_open stored as parquet as -select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri +create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as +select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from ${stats_db_name}.result_instance ri left outer join - (select ri.id, 1 as pub_closed_other_open from result_instance ri - join publication p on p.id=ri.id - join datasource d on ri.hostedby=d.id + (select ri.id, 1 as pub_closed_other_open from ${stats_db_name}.result_instance ri + join ${stats_db_name}.publication p on p.id=ri.id + join ${stats_db_name}.datasource d on ri.hostedby=d.id where d.type like '%Journal%' and ri.accessright='Closed Access' and (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; -ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- -create table if not exists indi_result_no_of_copies stored as parquet as -select id, count(id) as number_of_copies from result_instance group by id; +create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as +select id, count(id) as number_of_copies from ${stats_db_name}.result_instance group by id; -ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- -create table if not exists indi_pub_downloads stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_downloads stored as 
parquet as SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats - join publication on result_id=id + join ${stats_db_name}.publication on result_id=id where downloads>0 GROUP BY result_id order by no_downloads desc; -ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_downloads COMPUTE STATISTICS; -create table if not exists indi_pub_downloads_datasource stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats - join publication on result_id=id + join ${stats_db_name}.publication on result_id=id where downloads>0 GROUP BY result_id, repository_id order by result_id; -ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_downloads_datasource COMPUTE STATISTICS; -create table if not exists indi_pub_downloads_year stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us -join publication on result_id=id where downloads>0 +join ${stats_db_name}.publication on result_id=id where downloads>0 GROUP BY result_id, substring(us.`date`, 1,4); -ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_downloads_year COMPUTE STATISTICS; -create table if not exists indi_pub_downloads_datasource_year stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us -join publication on result_id=id +join ${stats_db_name}.publication on result_id=id where downloads>0 
GROUP BY result_id, repository_id, substring(us.`date`, 1,4); -ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- -create table if not exists indi_pub_gold_oa stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -284,7 +284,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as id, issn_printed as issn FROM - datasource + ${stats_db_name}.datasource WHERE issn_printed IS NOT NULL UNION ALL @@ -292,7 +292,7 @@ create table if not exists indi_pub_gold_oa stored as parquet as id, issn_online as issn FROM - datasource + ${stats_db_name}.datasource WHERE issn_online IS NOT NULL or id like '%doajarticles%') as issn WHERE @@ -300,16 +300,16 @@ create table if not exists indi_pub_gold_oa stored as parquet as SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold FROM - publication_datasources pd + ${stats_db_name}.publication_datasources pd left outer join( - select pd.id, 1 as is_gold FROM publication_datasources pd + select pd.id, 1 as is_gold FROM ${stats_db_name}.publication_datasources pd JOIN issn on issn.id=pd.datasource JOIN gold_oa on issn.issn = gold_oa.issn) tmp on pd.id=tmp.id; -ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_gold_oa COMPUTE STATISTICS; -create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as WITH hybrid_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn FROM STATS_EXT.plan_s_jn @@ -322,27 +322,27 @@ create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as SELECT * FROM ( SELECT id, issn_printed as issn - FROM datasource + FROM ${stats_db_name}.datasource WHERE issn_printed IS NOT NULL UNION ALL SELECT 
id,issn_online as issn - FROM datasource + FROM ${stats_db_name}.datasource WHERE issn_online IS NOT NULL ) as issn WHERE LENGTH(issn) > 7) SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa -FROM publication_datasources pd +FROM ${stats_db_name}.publication_datasources pd LEFT OUTER JOIN ( - SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd - JOIN datasource d on d.id=pd.datasource + SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd + JOIN ${stats_db_name}.datasource d on d.id=pd.datasource JOIN issn on issn.id=pd.datasource JOIN hybrid_oa ON issn.issn = hybrid_oa.issn - JOIN indi_result_has_cc_licence cc on pd.id=cc.id - JOIN indi_pub_gold_oa ga on pd.id=ga.id + JOIN ${stats_db_name}.indi_result_has_cc_licence cc on pd.id=cc.id + JOIN ${stats_db_name}.indi_pub_gold_oa ga on pd.id=ga.id where cc.has_cc_license=1 and ga.is_gold=0) tmp on pd.id=tmp.id; -ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; -create table if not exists indi_pub_hybrid stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -370,7 +370,7 @@ create table if not exists indi_pub_hybrid stored as parquet as id, issn_printed as issn FROM - datasource + ${stats_db_name}.datasource WHERE issn_printed IS NOT NULL UNION ALL @@ -378,424 +378,398 @@ create table if not exists indi_pub_hybrid stored as parquet as id, issn_online as issn FROM - datasource + ${stats_db_name}.datasource WHERE issn_online IS NOT NULL or id like '%doajarticles%') as issn WHERE LENGTH(issn) > 7) select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid -from publication_datasources pd +from ${stats_db_name}.publication_datasources pd left outer join ( - select pd.id, 1 as is_hybrid from publication_datasources pd - join datasource d on d.id=pd.datasource + select pd.id, 1 
as is_hybrid from ${stats_db_name}.publication_datasources pd + join ${stats_db_name}.datasource d on d.id=pd.datasource join issn on issn.id=pd.datasource join gold_oa on issn.issn=gold_oa.issn where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp on pd.id=tmp.id; -ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_hybrid COMPUTE STATISTICS; -create table if not exists indi_org_fairness stored as parquet as +create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as --return results with PIDs, and rich metadata group by organization with result_fair as - (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - join result r on r.id=ro.id + (select ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result r on r.id=ro.id --join result_pids rp on r.id=rp.id where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization), --return all results group by organization - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro - join result r on r.id=ro.id + allresults as (select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result r on r.id=ro.id where cast(year as int)>2003 - group by organization) + group by ro.organization) --return results_fair/all_results select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization; -ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_fairness COMPUTE STATISTICS; -create table if not exists indi_org_fairness_pub_pr stored as parquet as - 
with result_fair as - (select ro.organization organization, count(distinct ro.id) no_result_fair - from result_organization ro - join publication p on p.id=ro.id - join indi_pub_doi_from_crossref dc on dc.id=p.id - join indi_pub_grey_lit gl on gl.id=p.id +CREATE TEMPORARY table ${stats_db_name}.result_fair as +select ro.organization organization, count(distinct ro.id) no_result_fair + from ${stats_db_name}.result_organization ro + join ${stats_db_name}.publication p on p.id=ro.id + join ${stats_db_name}.indi_pub_doi_from_crossref dc on dc.id=p.id + join ${stats_db_name}.indi_pub_grey_lit gl on gl.id=p.id where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 - group by ro.organization), - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id + group by ro.organization; + +CREATE TEMPORARY TABLE ${stats_db_name}.allresults as +select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.publication p on p.id=ro.id where cast(year as int)>2003 - group by organization) ---return results_fair/all_results -select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization; + group by ro.organization; -ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS; +create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as +select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness +from ${stats_db_name}.allresults ar + join ${stats_db_name}.result_fair rf on rf.organization=ar.organization; -CREATE TEMPORARY table result_fair as - select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - 
join result p on p.id=ro.id +DROP table ${stats_db_name}.result_fair purge; +DROP table ${stats_db_name}.allresults purge; + +--ANALYZE TABLE ${stats_db_name}.indi_org_fairness_pub_pr COMPUTE STATISTICS; + +CREATE TEMPORARY table ${stats_db_name}.result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result p on p.id=ro.id where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization, year; -CREATE TEMPORARY TABLE allresults as select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join result p on p.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allresults as select year, ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result p on p.id=ro.id where cast(year as int)>2003 - group by organization, year; + group by ro.organization, year; -create table if not exists indi_org_fairness_pub_year stored as parquet as +create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; +from ${stats_db_name}.allresults + join ${stats_db_name}.result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -DROP table result_fair purge; -DROP table allresults purge; +DROP table ${stats_db_name}.result_fair purge; +DROP table ${stats_db_name}.allresults purge; -ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_fairness_pub_year COMPUTE STATISTICS; -CREATE TEMPORARY TABLE result_fair as +CREATE TEMPORARY 
TABLE ${stats_db_name}.result_fair as select ro.organization organization, count(distinct ro.id) no_result_fair - from result_organization ro - join result p on p.id=ro.id + from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result p on p.id=ro.id where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization; -CREATE TEMPORARY TABLE allresults as - select organization, count(distinct ro.id) no_allresults from result_organization ro - join result p on p.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allresults as + select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result p on p.id=ro.id where cast(year as int)>2003 - group by organization; + group by ro.organization; -create table if not exists indi_org_fairness_pub as -select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults join result_fair on result_fair.organization=allresults.organization; +create table if not exists ${stats_db_name}.indi_org_fairness_pub as +select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness +from ${stats_db_name}.allresults ar join ${stats_db_name}.result_fair rf +on rf.organization=ar.organization; -DROP table result_fair purge; -DROP table allresults purge; +DROP table ${stats_db_name}.result_fair purge; +DROP table ${stats_db_name}.allresults purge; -ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_fairness_pub COMPUTE STATISTICS; -CREATE TEMPORARY TABLE result_fair as - select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - join result r on r.id=ro.id - join result_pids rp on r.id=rp.id +CREATE TEMPORARY TABLE ${stats_db_name}.result_fair as + select year, ro.organization organization, count(distinct ro.id) 
no_result_fair from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result r on r.id=ro.id + join ${stats_db_name}.result_pids rp on r.id=rp.id where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization, year; -CREATE TEMPORARY TABLE allresults as - select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join result r on r.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allresults as + select year, ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result r on r.id=ro.id where cast(year as int)>2003 - group by organization, year; + group by ro.organization, year; -create table if not exists indi_org_fairness_year stored as parquet as +create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness - from allresults - join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + from ${stats_db_name}.allresults + join ${stats_db_name}.result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -DROP table result_fair purge; -DROP table allresults purge; +DROP table ${stats_db_name}.result_fair purge; +DROP table ${stats_db_name}.allresults purge; -ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_fairness_year COMPUTE STATISTICS; -CREATE TEMPORARY TABLE result_with_pid as - select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro - join result_pids rp on rp.id=ro.id - join result r on r.id=rp.id +CREATE TEMPORARY TABLE ${stats_db_name}.result_with_pid as + select year, ro.organization, count(distinct rp.id) 
no_result_with_pid from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result_pids rp on rp.id=ro.id + join ${stats_db_name}.result r on r.id=rp.id where cast(year as int) >2003 group by ro.organization, year; -CREATE TEMPORARY TABLE allresults as - select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join result r on r.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allresults as + select year, ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result r on r.id=ro.id where cast(year as int) >2003 - group by organization, year; + group by ro.organization, year; -create table if not exists indi_org_findable_year stored as parquet as +create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable -from allresults - join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; +from ${stats_db_name}.allresults + join ${stats_db_name}.result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; -DROP table result_with_pid purge; -DROP table allresults purge; +DROP table ${stats_db_name}.result_with_pid purge; +DROP table ${stats_db_name}.allresults purge; -ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_findable_year COMPUTE STATISTICS; -CREATE TEMPORARY TABLE result_with_pid as -select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro - join result_pids rp on rp.id=ro.id - join result r on r.id=rp.id +CREATE TEMPORARY TABLE ${stats_db_name}.result_with_pid as +select ro.organization, count(distinct rp.id) no_result_with_pid from ${stats_db_name}.result_organization ro + join 
${stats_db_name}.result_pids rp on rp.id=ro.id + join ${stats_db_name}.result r on r.id=rp.id where cast(year as int) >2003 group by ro.organization; -CREATE TEMPORARY TABLE allresults as -select organization, count(distinct ro.id) no_allresults from result_organization ro - join result r on r.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allresults as +select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro + join ${stats_db_name}.result r on r.id=ro.id where cast(year as int) >2003 - group by organization; + group by ro.organization; -create table if not exists indi_org_findable stored as parquet as +create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable -from allresults - join result_with_pid on result_with_pid.organization=allresults.organization; +from ${stats_db_name}.allresults + join ${stats_db_name}.result_with_pid on result_with_pid.organization=allresults.organization; -DROP table result_with_pid purge; -DROP table allresults purge; +DROP table ${stats_db_name}.result_with_pid purge; +DROP table ${stats_db_name}.allresults purge; -ANALYZE TABLE indi_org_findable COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_findable COMPUTE STATISTICS; -CREATE TEMPORARY TABLE pubs_oa as -SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r - join result_organization ro on ro.id=r.id - join result_instance ri on ri.id=r.id +CREATE TEMPORARY TABLE ${stats_db_name}.pubs_oa as +SELECT ro.organization, count(distinct r.id) no_oapubs FROM ${stats_db_name}.publication r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 group by ro.organization; -CREATE TEMPORARY 
TABLE datasets_oa as -SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r - join result_organization ro on ro.id=r.id - join result_instance ri on ri.id=r.id +CREATE TEMPORARY TABLE ${stats_db_name}.datasets_oa as +SELECT ro.organization, count(distinct r.id) no_oadatasets FROM ${stats_db_name}.dataset r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 group by ro.organization; -CREATE TEMPORARY TABLE software_oa as -SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r - join result_organization ro on ro.id=r.id - join result_instance ri on ri.id=r.id +CREATE TEMPORARY TABLE ${stats_db_name}.software_oa as +SELECT ro.organization, count(distinct r.id) no_oasoftware FROM ${stats_db_name}.software r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 group by ro.organization; -CREATE TEMPORARY TABLE allpubs as -SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro - join publication ps on ps.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allpubs as +SELECT ro.organization, count(ro.id) no_allpubs FROM ${stats_db_name}.result_organization ro + join ${stats_db_name}.publication ps on ps.id=ro.id where cast(ps.year as int)>2003 group by ro.organization; -CREATE TEMPORARY TABLE alldatasets as -SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro - join dataset ps on ps.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.alldatasets as +SELECT ro.organization, count(ro.id) no_alldatasets FROM ${stats_db_name}.result_organization ro + join ${stats_db_name}.dataset 
ps on ps.id=ro.id where cast(ps.year as int)>2003 group by ro.organization; -CREATE TEMPORARY TABLE allsoftware as -SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro - join software ps on ps.id=ro.id +CREATE TEMPORARY TABLE ${stats_db_name}.allsoftware as +SELECT ro.organization, count(ro.id) no_allsoftware FROM ${stats_db_name}.result_organization ro + join ${stats_db_name}.software ps on ps.id=ro.id where cast(ps.year as int)>2003 group by ro.organization; -CREATE TEMPORARY TABLE allpubsshare as -select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization; +CREATE TEMPORARY TABLE ${stats_db_name}.allpubsshare as +select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from ${stats_db_name}.allpubs + join ${stats_db_name}.pubs_oa on allpubs.organization=pubs_oa.organization; -CREATE TEMPORARY TABLE alldatasetssshare as +CREATE TEMPORARY TABLE ${stats_db_name}.alldatasetssshare as select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d - from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization; + from ${stats_db_name}.alldatasets + join ${stats_db_name}.datasets_oa on alldatasets.organization=datasets_oa.organization; -CREATE TEMPORARY TABLE allsoftwaresshare as +CREATE TEMPORARY TABLE ${stats_db_name}.allsoftwaresshare as select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s - from allsoftware - join software_oa on allsoftware.organization=software_oa.organization; + from ${stats_db_name}.allsoftware + join ${stats_db_name}.software_oa on allsoftware.organization=software_oa.organization; -create table if not exists indi_org_openess stored as parquet as +create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as select allpubsshare.organization, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null 
then 0 else 1 end) +(case when d is null then 0 else 1 end)) - org_openess FROM allpubsshare + org_openess FROM ${stats_db_name}.allpubsshare left outer join (select organization,d from - alldatasetssshare) tmp1 + ${stats_db_name}.alldatasetssshare) tmp1 on tmp1.organization=allpubsshare.organization left outer join (select organization,s from - allsoftwaresshare) tmp2 + ${stats_db_name}.allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization; -DROP TABLE pubs_oa purge; -DROP TABLE datasets_oa purge; -DROP TABLE software_oa purge; -DROP TABLE allpubs purge; -DROP TABLE alldatasets purge; -DROP TABLE allsoftware purge; -DROP TABLE allpubsshare purge; -DROP TABLE alldatasetssshare purge; -DROP TABLE allsoftwaresshare purge; +DROP TABLE ${stats_db_name}.pubs_oa purge; +DROP TABLE ${stats_db_name}.datasets_oa purge; +DROP TABLE ${stats_db_name}.software_oa purge; +DROP TABLE ${stats_db_name}.allpubs purge; +DROP TABLE ${stats_db_name}.alldatasets purge; +DROP TABLE ${stats_db_name}.allsoftware purge; +DROP TABLE ${stats_db_name}.allpubsshare purge; +DROP TABLE ${stats_db_name}.alldatasetssshare purge; +DROP TABLE ${stats_db_name}.allsoftwaresshare purge; -ANALYZE TABLE indi_org_openess COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_org_openess COMPUTE STATISTICS; -CREATE TEMPORARY TABLE pubs_oa AS -SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r - join result_organization ro on ro.id=r.id - join result_instance ri on ri.id=r.id +CREATE TEMPORARY TABLE ${stats_db_name}.pubs_oa AS +SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM ${stats_db_name}.publication r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 group by ro.organization,r.year; -CREATE TEMPORARY TABLE datasets_oa AS -SELECT 
r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r - join result_organization ro on ro.id=r.id - join result_instance ri on ri.id=r.id +CREATE TEMPORARY TABLE ${stats_db_name}.datasets_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM ${stats_db_name}.dataset r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 group by ro.organization, r.year; -CREATE TEMPORARY TABLE software_oa AS -SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r - join result_organization ro on ro.id=r.id - join result_instance ri on ri.id=r.id +CREATE TEMPORARY TABLE ${stats_db_name}.software_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM ${stats_db_name}.software r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 group by ro.organization, r.year; -CREATE TEMPORARY TABLE allpubs as -SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro - join publication p on p.id=ro.id where cast(p.year as int)>2003 +CREATE TEMPORARY TABLE ${stats_db_name}.allpubs as +SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM ${stats_db_name}.result_organization ro + join ${stats_db_name}.publication p on p.id=ro.id where cast(p.year as int)>2003 group by ro.organization, p.year; -CREATE TEMPORARY TABLE alldatasets as -SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro - join dataset d on d.id=ro.id where cast(d.year as int)>2003 +CREATE TEMPORARY TABLE ${stats_db_name}.alldatasets as +SELECT d.year, 
ro.organization organization, count(ro.id) no_alldatasets FROM ${stats_db_name}.result_organization ro + join ${stats_db_name}.dataset d on d.id=ro.id where cast(d.year as int)>2003 group by ro.organization, d.year; -CREATE TEMPORARY TABLE allsoftware as -SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro - join software s on s.id=ro.id where cast(s.year as int)>2003 +CREATE TEMPORARY TABLE ${stats_db_name}.allsoftware as +SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM ${stats_db_name}.result_organization ro + join ${stats_db_name}.software s on s.id=ro.id where cast(s.year as int)>2003 group by ro.organization, s.year; -CREATE TEMPORARY TABLE allpubsshare as -select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int); +CREATE TEMPORARY TABLE ${stats_db_name}.allpubsshare as +select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from ${stats_db_name}.allpubs + join ${stats_db_name}.pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int); -CREATE TEMPORARY TABLE alldatasetssshare as +CREATE TEMPORARY TABLE ${stats_db_name}.alldatasetssshare as select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d - from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int); + from ${stats_db_name}.alldatasets + join ${stats_db_name}.datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int); -CREATE TEMPORARY TABLE allsoftwaresshare as +CREATE TEMPORARY TABLE ${stats_db_name}.allsoftwaresshare as select allsoftware.year, software_oa.organization, 
software_oa.no_oasoftware/allsoftware.no_allsoftware s - from allsoftware - join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int); + from ${stats_db_name}.allsoftware + join ${stats_db_name}.software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int); -create table if not exists indi_org_openess_year stored as parquet as +create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as select allpubsshare.year, allpubsshare.organization, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) - org_openess FROM allpubsshare + org_openess FROM ${stats_db_name}.allpubsshare left outer join (select year, organization,d from - alldatasetssshare) tmp1 + ${stats_db_name}.alldatasetssshare) tmp1 on tmp1.organization=allpubsshare.organization and tmp1.year=allpubsshare.year left outer join (select year, organization,s from - allsoftwaresshare) tmp2 + ${stats_db_name}.allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year; -DROP TABLE pubs_oa purge; -DROP TABLE datasets_oa purge; -DROP TABLE software_oa purge; -DROP TABLE allpubs purge; -DROP TABLE alldatasets purge; -DROP TABLE allsoftware purge; -DROP TABLE allpubsshare purge; -DROP TABLE alldatasetssshare purge; -DROP TABLE allsoftwaresshare purge; +DROP TABLE ${stats_db_name}.pubs_oa purge; +DROP TABLE ${stats_db_name}.datasets_oa purge; +DROP TABLE ${stats_db_name}.software_oa purge; +DROP TABLE ${stats_db_name}.allpubs purge; +DROP TABLE ${stats_db_name}.alldatasets purge; +DROP TABLE ${stats_db_name}.allsoftware purge; +DROP TABLE ${stats_db_name}.allpubsshare purge; +DROP TABLE ${stats_db_name}.alldatasetssshare purge; +DROP TABLE ${stats_db_name}.allsoftwaresshare purge; -ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS; 
+--ANALYZE TABLE ${stats_db_name}.indi_org_openess_year COMPUTE STATISTICS; -create table if not exists indi_pub_has_preprint stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as select distinct p.id, coalesce(has_preprint, 0) as has_preprint -from publication_classifications p +from ${stats_db_name}.publication_classifications p left outer join ( select p.id, 1 as has_preprint - from publication_classifications p + from ${stats_db_name}.publication_classifications p where p.type='Preprint') tmp on p.id= tmp.id; -ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_has_preprint COMPUTE STATISTICS; -create table if not exists indi_pub_in_subscribed stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as select distinct p.id, coalesce(is_subscription, 0) as is_subscription -from publication p +from ${stats_db_name}.publication p left outer join( - select p.id, 1 as is_subscription from publication p - join indi_pub_gold_oa g on p.id=g.id - join indi_pub_hybrid h on p.id=h.id - join indi_pub_in_transformative t on p.id=t.id + select p.id, 1 as is_subscription from ${stats_db_name}.publication p + join ${stats_db_name}.indi_pub_gold_oa g on p.id=g.id + join ${stats_db_name}.indi_pub_hybrid h on p.id=h.id + join ${stats_db_name}.indi_pub_in_transformative t on p.id=t.id where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp on p.id=tmp.id; -ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_in_subscribed COMPUTE STATISTICS; -create table if not exists indi_result_with_pid as +create table if not exists ${stats_db_name}.indi_result_with_pid as select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid -from result p +from ${stats_db_name}.result p left outer join ( select p.id, 1 as result_with_pid - from result_pids p) tmp + from 
${stats_db_name}.result_pids p) tmp on p.id= tmp.id; -ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_result_with_pid COMPUTE STATISTICS; -CREATE TEMPORARY TABLE pub_fos_totals as -select rf.id, count(distinct lvl3) totals from result_fos rf +CREATE TEMPORARY TABLE ${stats_db_name}.pub_fos_totals as +select rf.id, count(distinct lvl3) totals from ${stats_db_name}.result_fos rf group by rf.id; -create table if not exists indi_pub_interdisciplinarity as +create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity as select distinct p.id as id, coalesce(is_interdisciplinary, 0) as is_interdisciplinary -from pub_fos_totals p +from ${stats_db_name}.pub_fos_totals p left outer join ( -select pub_fos_totals.id, 1 as is_interdisciplinary from pub_fos_totals +select pub_fos_totals.id, 1 as is_interdisciplinary from ${stats_db_name}.pub_fos_totals where totals>1) tmp on p.id=tmp.id; -drop table pub_fos_totals purge; +drop table ${stats_db_name}.pub_fos_totals purge; -ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS; +--ANALYZE TABLE ${stats_db_name}.indi_pub_interdisciplinarity COMPUTE STATISTICS; -create table if not exists indi_pub_bronze_oa stored as parquet as +create table if not exists ${stats_db_name}.indi_pub_bronze_oa stored as parquet as select distinct p.id, coalesce(is_bronze_oa,0) as is_bronze_oa -from publication p +from ${stats_db_name}.publication p left outer join -(select p.id, 1 as is_bronze_oa from publication p -join indi_result_has_cc_licence cc on cc.id=p.id -join indi_pub_gold_oa ga on ga.id=p.id +(select p.id, 1 as is_bronze_oa from ${stats_db_name}.publication p +join ${stats_db_name}.indi_result_has_cc_licence cc on cc.id=p.id +join ${stats_db_name}.indi_pub_gold_oa ga on ga.id=p.id where cc.has_cc_license=0 and ga.is_gold=0) tmp on tmp.id=p.id; --- create table if not exists indi_pub_bronze_oa stored as parquet as --- WITH hybrid_oa AS ( --- SELECT issn_l, journal_is_in_doaj, 
journal_is_oa, issn_print as issn --- FROM STATS_EXT.plan_s_jn --- WHERE issn_print != "" --- UNION ALL --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn --- FROM STATS_EXT.plan_s_jn --- WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), --- issn AS ( --- SELECT * --- FROM ( --- SELECT id, issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION ALL --- SELECT id,issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_bronze_oa ---FROM publication_datasources pd --- LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd --- JOIN datasource d on d.id=pd.datasource --- JOIN issn on issn.id=pd.datasource --- JOIN hybrid_oa ON issn.issn = hybrid_oa.issn --- JOIN indi_result_has_cc_licence cc on pd.id=cc.id --- JOIN indi_pub_gold_oa ga on pd.id=ga.id --- JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id --- where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id; - -ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS; \ No newline at end of file +--ANALYZE TABLE ${stats_db_name}.indi_pub_bronze_oa COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 9744d5aae..3eeb792c7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -90,83 +90,83 @@ create view if not exists TARGET.totalresearchersft as select * from SOURCE.tota create view if not 
exists TARGET.hrrst as select * from SOURCE.hrrst; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE 
TARGET.result_datasources COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; -ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_oids COMPUTE 
STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_sources 
COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -175,7 +175,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; -ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists 
TARGET.organization as select * from SOURCE.organization; @@ -193,28 +193,28 @@ create view if not exists TARGET.project_classification as select * from SOURCE. create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; -ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); 
-ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -223,32 +223,32 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; create table TARGET.indi_pub_in_transformative stored as parquet as select * 
from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 
1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -259,12 +259,14 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from 
SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); -ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; +--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; +create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql new file mode 100644 index 000000000..a59791084 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -0,0 +1,276 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create view if not exists TARGET.category as select * from SOURCE.category; +create view if not exists TARGET.concept as select * from SOURCE.concept; +create view if not exists TARGET.context as select * from SOURCE.context; +create view if not exists TARGET.country as select * from SOURCE.country; +create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; +create view if not exists TARGET.funder as select * from SOURCE.funder; +create view if not exists TARGET.fundref as select * from SOURCE.fundref; +create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; + +create table TARGET.result stored as parquet as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from 
SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( + 'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC) + 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council + 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', -- NOTE(review): organization name unknown; this id may not exist - confirm + 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University + 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade + 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki + 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho + 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid + 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen + 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens + -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot + 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University + 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark + 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin + 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt + 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven + 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape + 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute + 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University + 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg + 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) + 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr + 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw +
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly + 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete + 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus + 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras + 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki + 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank + 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech + 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University + 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona + 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University + 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia + 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University + 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan + 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town + 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin + 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology + 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba + 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili + 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University + 'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique + 'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- 
National Institute of Geophysics and Volcanology + 'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam + 'openorgs____::4d4051b56708688235252f1d8fddb8c1', --Iscte - Instituto Universitário de Lisboa + 'openorgs____::ab4ac74c35fa5dada770cf08e5110fab' -- Universidade Católica Portuguesa + ) )) foo; + +--ANALYZE TABLE TARGET.result COMPUTE STATISTICS; + +create view if not exists TARGET.category as select * from SOURCE.category; +create view if not exists TARGET.concept as select * from SOURCE.concept; +create view if not exists TARGET.context as select * from SOURCE.context; +create view if not exists TARGET.country as select * from SOURCE.country; +create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; +create view if not exists TARGET.funder as select * from SOURCE.funder; +create view if not exists TARGET.fundref as select * from SOURCE.fundref; +create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; + +create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; + +create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_references_oc COMPUTE 
STATISTICS; + +create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; + +create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; + +create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; + +create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; + +create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; + +create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; + +create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; + +create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; + +create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r 
where r.id=orig.id); +--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; + +create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; + +create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; +--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; + +create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; + +create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; + +create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; + +create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; + +create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; + +create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; + +create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r 
where r.id=orig.id); +--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; + +create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; + +create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; + +create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; + +create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; + +create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); +create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); +create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +drop view TARGET.foo1; +drop view TARGET.foo2; +--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; + +-- datasources +create view if not exists TARGET.datasource as select * from SOURCE.datasource; +create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; +create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations; +create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; + +create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; 
+--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; + +-- organizations +create view if not exists TARGET.organization as select * from SOURCE.organization; +create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources; +create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids; +create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects; +create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources; + +-- projects +create view if not exists TARGET.project as select * from SOURCE.project; +create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; +create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; +create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; +create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; +create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; + +create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; +--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; + +-- indicators +-- Sprint 1 ---- +create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; +create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; +create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where 
exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; +-- Sprint 2 ---- +create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; +create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; +create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; +create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; +---- Sprint 3 ---- +create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; +create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; +create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; +create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; +create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country; +create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab; +create view 
TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; +---- Sprint 4 ---- +create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; +create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; +create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; +---- Sprint 5 ---- +create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; +---- Sprint 6 ---- +create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; +create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource 
orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; +create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; +---- Sprint 7 ---- +create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; +create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; +create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; +create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; +create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; +create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub; +create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year; +create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year; +create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; +create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; +create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; +create table 
TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; +create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; +create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; +create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; +create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; +create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql index 92b40405d..9a9407c2d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql @@ -12,4 +12,4 @@ create table TARGET.result stored as parquet as -- join SOURCE.result where rc.id=r.id and conc.category like CONTEXT) ) foo; -ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file +--ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql index ef6d08d79..bad18efde 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql @@ -12,4 +12,4 @@ create table TARGET.result stored as parquet as -- join SOURCE.result where rc.id=r.id and conc.category not in (CONTEXTS)) ) foo; -ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file +--ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql index 8d8739c74..b8d3c0242 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql @@ -6,4 +6,4 @@ create table TARGET.result stored as parquet as select * from 
SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) ) foo; -ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file +--ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index 442e623cd..1f75c3cd1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -42,7 +42,7 @@ create table TARGET.result stored as parquet as 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork - 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University + 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin @@ -52,7 +52,10 @@ create table TARGET.result stored as parquet as 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University 'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique 'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology - 'openorgs____::1624ff7c01bb641b91f4518539a0c28a' -- Vrije Universiteit Amsterdam + 
'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam + 'openorgs____::4d4051b56708688235252f1d8fddb8c1', --Iscte - Instituto Universitário de Lisboa + 'openorgs____::ab4ac74c35fa5dada770cf08e5110fab' -- Universidade Católica Portuguesa + ))) foo; -ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file +--ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 2d7d572b3..b7e421813 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -8,7 +8,7 @@ from ${stats_db_name}.result r group by rl.id ) rln on rln.id=r.id; -ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_country stored as parquet as select @@ -39,7 +39,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_year stored as parquet as select @@ -70,7 +70,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when 
r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select @@ -101,7 +101,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select @@ -134,7 +134,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select @@ -167,7 +167,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; +--ANALYZE TABLE 
${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select @@ -198,7 +198,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select @@ -229,7 +229,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select @@ -262,7 +262,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select @@ -295,7 +295,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when 
r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_country stored as parquet as select @@ -328,7 +328,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_year stored as parquet as select @@ -361,7 +361,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select @@ -394,7 +394,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; +--ANALYZE TABLE 
${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select @@ -427,7 +427,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select @@ -460,7 +460,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_organization stored as parquet as select @@ -493,7 +493,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select @@ -526,7 +526,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode 
in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_funder stored as parquet as select @@ -561,7 +561,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; +--ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select @@ -596,4 +596,4 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file +--ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 2ab50fb29..c03520e48 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -317,15 +317,12 @@ - - ${jobTracker} - ${nameNode} - indicators.sh - ${stats_db_name} - ${external_stats_db_name} - ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql - indicators.sh - + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + external_stats_db_name=${external_stats_db_name} + @@ -378,6 +375,7 @@ ${wf:appPath()}/scripts/step20-createMonitorDB_institutions.sql ${wf:appPath()}/scripts/step20-createMonitorDB_RIs.sql ${wf:appPath()}/scripts/step20-createMonitorDB_RIs_tail.sql + ${wf:appPath()}/scripts/step20-createMonitorDBAll.sql monitor.sh @@ -469,7 +467,7 @@ ${usage_stats_db_shadow_name} finalizeImpalaCluster.sh - +