From dcb958e1467bf53761d826d74e7bc107f6ab6d91 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 4 Jan 2023 11:39:01 +0200 Subject: [PATCH] Changes to execute the stats wf only in hive --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 4 +- .../oa/graph/stats/oozie_app/finalizedb.sh | 6 +- .../oa/graph/stats/oozie_app/indicators.sh | 4 +- .../dhp/oa/graph/stats/oozie_app/monitor.sh | 15 +- .../graph/stats/oozie_app/observatory-post.sh | 2 +- .../stats/oozie_app/scripts/step15_5.sql | 11 +- .../scripts/step16-createIndicatorsTables.sql | 693 +++++++----------- .../scripts/step20-createMonitorDB.sql | 105 +-- .../scripts/step21-createObservatoryDB.sql | 40 +- .../graph/stats/oozie_app/scripts/step8.sql | 2 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 144 ++-- 11 files changed, 472 insertions(+), 554 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index 0ce57e095..e152eb1ee 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -31,8 +31,8 @@ hdfs dfs -copyFromLocal categories.csv ${TMP} hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -chmod -R 777 ${TMP} -export HADOOP_USER="antonis.lempesis" -export HADOOP_USER_NAME="antonis.lempesis" +export HADOOP_USER="dimitris.pierrakos" +export HADOOP_USER_NAME="dimitris.pierrakos" echo "Creating and populating impala tables" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh index 9de472955..011cfcc28 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -8,7 +8,9 @@ fi export SOURCE=$1 export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" echo "Updating shadow database" -hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo -hive -f foo \ No newline at end of file +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index fd95c8514..6c76e35f2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -8,8 +8,8 @@ fi export TARGET=$1 export SCRIPT_PATH=$2 -export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" -export HADOOP_USER="antonis.lempesis" +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index a4e7eec57..25095f4d3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -11,10 +11,15 @@ export TARGET=$2 export SHADOW=$3 export SCRIPT_PATH=$4 -echo "Getting file from " $4 -hdfs dfs -copyToLocal $4 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $SCRIPT_PATH +hdfs dfs -copyToLocal $SCRIPT_PATH + echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo -hive -f foo -echo "Impala shell finished" \ No newline at end of file +#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo +hive $HIVE_OPTS -f foo +echo "Hive shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index 12315c9e8..fafafe59a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -12,4 +12,4 @@ export SHADOW=$3 hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive -f foo -echo "Impala shell finished" \ No newline at end of file +echo "Hive shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 9b2630286..1ae856355 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -29,6 +29,13 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; +create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; +create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; +create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; +create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; +create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; +create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; + create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( @@ -43,4 +50,6 @@ from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; \ No newline at end of file +where r.amount is not null; + +create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 1bda07629..ac4d4202a 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,5 +1,5 @@ -- Sprint 1 ---- -create table indi_pub_green_oa stored as parquet as +create table if not exists indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from publication p left outer join ( @@ -12,9 +12,9 @@ from publication p or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp on p.id= tmp.id; -compute stats indi_pub_green_oa; +ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS; -create table indi_pub_grey_lit stored as parquet as +create table if not exists indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit from publication p left outer join ( @@ -25,9 +25,9 @@ from publication p not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; -compute stats indi_pub_grey_lit; +ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS; -create table indi_pub_doi_from_crossref stored as parquet as +create table if not exists indi_pub_doi_from_crossref stored as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from publication p left outer join @@ -36,10 +36,10 @@ from publication p where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp on tmp.id=p.id; -compute stats indi_pub_doi_from_crossref; +ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- -create table indi_result_has_cc_licence stored as parquet as +create table if not exists indi_result_has_cc_licence stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from result r left outer join (select r.id, license.type as lic from result r @@ -47,9 +47,9 @@ from result r where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence; +ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS; -create table indi_result_has_cc_licence_url stored as parquet as +create table if not exists indi_result_has_cc_licence_url stored as parquet as select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url from result r left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host @@ -58,31 +58,31 @@ from result r WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence_url; +ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS; -create table indi_pub_has_abstract stored as parquet as -select distinct publication.id, coalesce(abstract, 1) has_abstract +create table if not exists indi_pub_has_abstract stored as parquet as +select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract from publication; -compute stats indi_pub_has_abstract; +ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS; -create table indi_result_with_orcid stored as parquet as +create table if not exists indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid from result r left outer join (select id, 1 as has_orcid from result_orcid) tmp on r.id= tmp.id; -compute stats indi_result_with_orcid; +ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- -create table indi_funded_result_with_fundref stored as parquet as +create table if not exists indi_funded_result_with_fundref stored as parquet as select distinct r.result as id, coalesce(fundref, 0) as fundref from project_results r left outer join (select distinct result, 1 as fundref from project_results where provenance='Harvested') tmp on r.result= tmp.result; -compute stats indi_funded_result_with_fundref; +ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS; -- create table indi_result_org_collab stored as parquet as -- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations @@ -92,77 +92,59 @@ compute stats indi_funded_result_with_fundref; -- -- compute stats indi_result_org_collab; -- -create table indi_result_org_collab stored as parquet as -with tmp as ( -select distinct ro.organization organization, ro.id from result_organization ro -join organization o on o.id=ro.organization where o.name is not null) +create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id from result_organization ro +join organization o on o.id=ro.organization where o.name is not null; + +create table if not exists indi_result_org_collab stored as parquet as select o1.organization org1, o2.organization org2, count(o1.id) as collaborations from tmp as o1 -join tmp as o2 on o1.id=o2.id and o1.organization!=o2.organization -group by org1, org2; +join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization +group by o1.organization, o2.organization; -compute stats indi_result_org_collab; +drop table tmp purge; --- create table indi_result_org_country_collab stored as parquet as --- with tmp as --- (select o.id as id, o.country , ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where o.country <> 'UNKNOWN') --- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id and o1.country<>o2.country --- group by o1.id, o1.type,o2.country; --- --- compute stats indi_result_org_country_collab; --- -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select distinct ro.organization organization, ro.id, o.country from result_organization ro -join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) +ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS; + +create TEMPORARY TABLE tmp AS +select distinct ro.organization organization, ro.id, o.country from result_organization ro +join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_org_country_collab stored as parquet as select o1.organization org1,o2.country country2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o2.country; -compute stats indi_result_org_country_collab; +drop table tmp purge; --- create table indi_result_org_collab stored as parquet as --- with tmp as --- (select o.id, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id) --- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id --- group by o1.id, o2.id, o1.type; --- --- compute stats indi_result_org_collab; --- -create table indi_project_collab_org stored as parquet as +ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS; + +create table if not exists indi_project_collab_org stored as parquet as select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations from organization_projects as o1 join organization_projects as o2 on o1.project=o2.project where o1.id!=o2.id group by o1.id, o2.id; -compute stats indi_project_collab_org; +ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS; -create table indi_project_collab_org_country stored as parquet as - with tmp as - (select o.id organization, o.country , ro.project as project from organization o +create TEMPORARY TABLE tmp AS +select o.id organization, o.country , ro.project as project from organization o join organization_projects ro on o.id=ro.id - and o.country <> 'UNKNOWN') + and o.country <> 'UNKNOWN'; + +create table if not exists indi_project_collab_org_country stored as parquet as select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country group by o1.organization, o2.country; -compute stats indi_project_collab_org_country; +drop table tmp purge; -create table indi_funder_country_collab stored as parquet as +ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS; + +create table if not exists indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from organization_projects op join organization o on o.id=op.id join project p on p.id=op.project @@ -173,36 +155,26 @@ from tmp as f1 where f1.country<>f2.country group by f1.funder, f2.country, f1.country; -compute stats indi_funder_country_collab; --- --- create table indi_result_country_collab stored as parquet as --- with tmp as --- (select country, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where country <> 'UNKNOWN') --- select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.country<>o2.country --- group by o1.country, o2.country, o1.type; --- --- compute stats indi_result_country_collab; +ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS; -create table indi_result_country_collab stored as parquet as -with tmp as - (select distinct country, ro.id as result from organization o +create TEMPORARY TABLE tmp AS +select distinct country, ro.id as result from organization o join result_organization ro on o.id=ro.organization - where country <> 'UNKNOWN' and o.name is not null) + where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_country_collab stored as parquet as select o1.country country1, o2.country country2, count(o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.country<>o2.country group by o1.country, o2.country; -compute stats indi_result_country_collab; +drop table tmp purge; + +ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS; ---- Sprint 4 ---- -create table indi_pub_diamond stored as parquet as +create table if not exists indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd left outer join ( @@ -212,21 +184,9 @@ from publication_datasources pd and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; -compute stats indi_pub_diamond; +ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS; ---create table indi_pub_hybrid stored as parquet as ---select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid ---from publication_datasources pd --- left outer join ( --- select pd.id, 1 as is_hybrid from publication_datasources pd --- join datasource d on d.id=pd.datasource --- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) --- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp --- on pd.id=tmp.id; --- ---compute stats indi_pub_hybrid; - -create table indi_pub_in_transformative stored as parquet as +create table if not exists indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative from publication pd left outer join ( @@ -236,9 +196,9 @@ from publication pd and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; -compute stats indi_pub_in_transformative; +ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS; -create table indi_pub_closed_other_open stored as parquet as +create table if not exists indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri left outer join (select ri.id, 1 as pub_closed_other_open from result_instance ri @@ -248,180 +208,16 @@ select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_op (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; -compute stats indi_pub_closed_other_open; +ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- -create table indi_result_no_of_copies stored as parquet as +create table if not exists indi_result_no_of_copies stored as parquet as select id, count(id) as number_of_copies from result_instance group by id; -compute stats indi_result_no_of_copies; +ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- ---create table indi_pub_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION ALL --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * FROM --- (SELECT id, issn_printed as issn --- FROM datasource WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource WHERE issn_online IS NOT NULL) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM publication_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM publication_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; - ---compute stats indi_pub_gold_oa; --- ---create table indi_datasets_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM dataset_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_datasets_gold_oa; - ---create table indi_software_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM software_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM software_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_software_gold_oa; - ---create table indi_org_findable stored as parquet as ---with result_with_pid as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro --- join result_pids rp on rp.id=ro.id --- group by ro.organization), ---result_has_abstract as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro --- join result rp on rp.id=ro.id where rp.abstract=true --- group by ro.organization), ---allresults as ( --- select organization, count(distinct id) no_allresults from result_organization --- group by organization), ---result_with_pid_share as ( --- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share --- from allresults --- join result_with_pid on result_with_pid.organization=allresults.organization), ---result_with_abstract_share as ( --- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share --- from allresults --- join result_has_abstract on result_has_abstract.organization=allresults.organization) ---select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable ---from allresults ---join result_with_pid_share on result_with_pid_share.organization=allresults.organization ---left outer join ( --- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization; --- ---compute stats indi_org_findable; --- ---create table indi_org_openess stored as parquet as ---WITH datasets_oa as ( --- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join dataset ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---software_oa as ( --- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join software ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---pubs_oa as ( --- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join publication ds on dg.id=ds.id --- where dg.is_gold=1 --- group by ro.organization), ---allpubs as ( --- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro --- join publication ps on ps.id=ro.id --- group by ro.organization), ---alldatasets as ( --- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro --- join dataset ps on ps.id=ro.id --- group by ro.organization), ---allsoftware as ( --- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro --- join software ps on ps.id=ro.id --- group by ro.organization), ---allpubsshare as ( --- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs --- join pubs_oa on allpubs.organization=pubs_oa.organization), ---alldatasetssshare as ( --- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c --- from alldatasets --- join datasets_oa on alldatasets.organization=datasets_oa.organization), ---allsoftwaresshare as ( --- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s --- from allsoftware --- join software_oa on allsoftware.organization=software_oa.organization) ---select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess ---FROM allpubsshare ---left outer join ( --- select organization,c from --- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization ---left outer join ( --- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization; --- ---compute stats indi_org_openess; --- -create table indi_pub_hybrid_oa_with_cc stored as parquet as +create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as WITH hybrid_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn FROM stats_ext.plan_s_jn @@ -436,7 +232,7 @@ create table indi_pub_hybrid_oa_with_cc stored as parquet as SELECT id, issn_printed as issn FROM datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id,issn_online as issn FROM datasource WHERE issn_online IS NOT NULL ) as issn @@ -451,45 +247,44 @@ FROM publication_datasources pd JOIN indi_result_has_cc_licence cc on pd.id=cc.id where cc.has_cc_license=1) tmp on pd.id=tmp.id; -compute stats indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; -create table indi_pub_downloads stored as parquet as +create table if not exists indi_pub_downloads stored as parquet as SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id order by no_downloads desc; -compute stats indi_pub_downloads; +ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS; -create table indi_pub_downloads_datasource stored as parquet as +create table if not exists indi_pub_downloads_datasource stored as parquet as SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id order by result_id; -compute stats indi_pub_downloads_datasource; +ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS; -create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id where downloads>0 -GROUP BY result_id, `year` -order by `year` asc; +create table if not exists indi_pub_downloads_year stored as parquet as +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads +from openaire_prod_usage_stats.usage_stats us +join publication on result_id=id where downloads>0 +GROUP BY result_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_year; +ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS; -create table indi_pub_downloads_datasource_year stored as parquet as +create table if not exists indi_pub_downloads_datasource_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id +join publication on result_id=id where downloads>0 -GROUP BY result_id, repository_id, `year` -order by `year` asc, result_id; +GROUP BY result_id, repository_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_datasource_year; +ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- -create table indi_pub_gold_oa stored as parquet as +create table if not exists indi_pub_gold_oa stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -518,7 +313,7 @@ create table indi_pub_gold_oa stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -538,9 +333,9 @@ FROM JOIN gold_oa on issn.issn = gold_oa.issn) tmp on pd.id=tmp.id; -compute stats indi_pub_gold_oa; +ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS; -create table indi_pub_hybrid stored as parquet as +create table if not exists indi_pub_hybrid stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -571,7 +366,7 @@ create table indi_pub_hybrid stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -591,15 +386,15 @@ from publication_datasources pd where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp on pd.id=tmp.id; -compute stats indi_pub_hybrid; +ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS; -create table indi_org_fairness stored as parquet as +create table if not exists indi_org_fairness stored as parquet as --return results with PIDs, and rich metadata group by organization with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id --join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization), --return all results group by organization allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -611,16 +406,16 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness; +ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS; -create table indi_org_fairness_pub_pr stored as parquet as +create table if not exists indi_org_fairness_pub_pr stored as parquet as with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join publication p on p.id=ro.id join indi_pub_doi_from_crossref dc on dc.id=p.id join indi_pub_grey_lit gl on gl.id=p.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 group by ro.organization), allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -632,150 +427,180 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub_pr; +ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS; -create table indi_org_fairness_pub_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id +CREATE TEMPORARY table result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id where cast(year as int)>2003 - group by organization, year) + group by organization, year; + +create table if not exists indi_org_fairness_pub_year stored as parquet as select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -compute stats indi_org_fairness_pub_year; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_pub as -with result_fair as - (select ro.organization organization, count(distinct ro.id) no_result_fair - from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) - and (authors>0) and cast(year as int)>2003 - group by ro.organization), - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id - where cast(year as int)>2003 - group by organization) +ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select ro.organization organization, count(distinct ro.id) no_result_fair + from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) + and (authors>0) and cast(year as int)>2003 + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as + select organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id + where cast(year as int)>2003 + group by organization; + +create table if not exists indi_org_fairness_pub as select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization; +from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro +ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int)>2003 - group by organization, year) ---return results_fair/all_results -select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + group by organization, year; -compute stats indi_org_fairness_year; +create table if not exists indi_org_fairness_year stored as parquet as + select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + from allresults + join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -create table indi_org_findable_year stored as parquet as ---return results with PIDs group by organization,year - with result_with_pid as - (select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +DROP table result_fair purge; +DROP table allresults purge; + +ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as + select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization, year), ---return all results group by organization,year - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization, year) ---return results_with_pid/all_results + group by organization, year; + +create table if not exists indi_org_findable_year stored as parquet as select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; -compute stats indi_org_findable_year; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_findable stored as parquet as ---return results with PIDs group by organization - with result_with_pid as - (select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as +select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization), ---return all results group by organization - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as +select organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization) ---return results_with_pid/all_results + group by organization; + +create table if not exists indi_org_findable stored as parquet as select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization; -compute stats indi_org_findable; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_openess stored as parquet as - WITH pubs_oa as ( - SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_findable COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa as +SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - datasets_oa as ( - SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization; + +CREATE TEMPORARY TABLE datasets_oa as +SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - software_oa as ( - SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization; + +CREATE TEMPORARY TABLE software_oa as +SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - allpubs as ( - SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allpubs as +SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - alldatasets as ( - SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE alldatasets as +SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allsoftware as ( - SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allsoftware as +SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allpubsshare as ( - select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization), - alldatasetssshare as ( - select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization; + +CREATE TEMPORARY TABLE allpubsshare as +select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization; + +CREATE TEMPORARY TABLE alldatasetssshare as +select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization), - allsoftwaresshare as ( - select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization; + +CREATE TEMPORARY TABLE allsoftwaresshare as +select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization) + join software_oa on allsoftware.organization=software_oa.organization; + +create table if not exists indi_org_openess stored as parquet as select allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select organization,d from @@ -785,55 +610,75 @@ select allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization; -compute stats indi_org_openess; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_org_openess_year stored as parquet as - WITH pubs_oa as ( - SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_openess COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa AS +SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization,r.year), - datasets_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization,r.year; + +CREATE TEMPORARY TABLE datasets_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - software_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE software_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - allpubs as ( - SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE allpubs as +SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication p on p.id=ro.id where cast(p.year as int)>2003 - group by ro.organization, p.year), - alldatasets as ( - SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization, p.year; + +CREATE TEMPORARY TABLE alldatasets as +SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset d on d.id=ro.id where cast(d.year as int)>2003 - group by ro.organization, d.year), - allsoftware as ( - SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization, d.year; + +CREATE TEMPORARY TABLE allsoftware as +SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software s on s.id=ro.id where cast(s.year as int)>2003 - group by ro.organization, s.year), - allpubsshare as ( - select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)), - alldatasetssshare as ( - select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization, s.year; + +CREATE TEMPORARY TABLE allpubsshare as +select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int); + +CREATE TEMPORARY TABLE alldatasetssshare as +select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)), - allsoftwaresshare as ( - select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int); + +CREATE TEMPORARY TABLE allsoftwaresshare as +select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int)) + join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int); + + +create table if not exists indi_org_openess_year stored as parquet as select allpubsshare.year, allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select year, organization,d from @@ -843,9 +688,19 @@ select allpubsshare.year, allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year; -compute stats indi_org_openess_year; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_pub_has_preprint stored as parquet as +ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS; + +create table if not exists indi_pub_has_preprint stored as parquet as select distinct p.id, coalesce(has_preprint, 0) as has_preprint from publication_classifications p left outer join ( @@ -854,9 +709,9 @@ from publication_classifications p where p.type='Preprint') tmp on p.id= tmp.id; -compute stats indi_pub_has_preprint; +ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS; -create table indi_pub_in_subscribed stored as parquet as +create table if not exists indi_pub_in_subscribed stored as parquet as select distinct p.id, coalesce(is_subscription, 0) as is_subscription from publication p left outer join( @@ -867,9 +722,9 @@ from publication p where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp on p.id=tmp.id; -compute stats indi_pub_in_subscribed; +ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS; -create table indi_result_with_pid as +create table if not exists indi_result_with_pid as select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid from result p left outer join ( @@ -877,4 +732,4 @@ from result p from result_pids p) tmp on p.id= tmp.id; -compute stats indi_result_with_pid; \ No newline at end of file +ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 98dca7129..195836480 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -10,6 +10,11 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_ create view if not exists TARGET.funder as select * from SOURCE.funder; create view if not exists TARGET.fundref as select * from SOURCE.fundref; create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; create table TARGET.result stored as parquet as select distinct * from ( @@ -54,84 +59,87 @@ create table TARGET.result stored as parquet as 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje - 'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan + 'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork ) )) foo; -compute stats TARGET.result; + +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations; +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_references_oc; +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations_oc; +ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_classifications; +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_apc; +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_concepts; +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_datasources; +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fundercount; +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_gold; +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_greenoa; +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_languages; +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_licenses; +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; +ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_oids; +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_organization; +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_peerreviewed; +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_pids; +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projectcount; +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projects; +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_refereed; +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_sources; +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_topics; +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fos; +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -compute stats TARGET.result_result; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -140,7 +148,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; -compute stats TARGET.datasource_results; +ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists TARGET.organization as select * from SOURCE.organization; @@ -157,28 +165,28 @@ create view if not exists TARGET.project_resultcount as select * from SOURCE.pro create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; -compute stats TARGET.project_results; +ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence; +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence_url; +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_has_abstract; +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_with_orcid; +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_funded_result_with_fundref; +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -187,30 +195,30 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_diamond; +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_in_transformative; +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_closed_other_open; +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_no_of_copies; +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads; +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_year; +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource_year; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid; +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -221,11 +229,12 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); - +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; --create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_datasets_gold_oa; --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_software_gold_oa; - diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index e24370e7d..2d7d572b3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -8,6 +8,8 @@ from ${stats_db_name}.result r group by rl.id ) rln on rln.id=r.id; +ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_country stored as parquet as select count(distinct r.id) as total, @@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year stored as parquet as select count(distinct r.id) as total, @@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, @@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, @@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, @@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, @@ -215,6 +229,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, @@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, @@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, @@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as total, @@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, @@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, @@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization stored as parquet as select count(distinct r.id) as total, @@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, @@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, @@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, @@ -558,4 +594,6 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; \ No newline at end of file + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 131f96df9..248716b36 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -96,6 +96,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 81da11903..9976b8455 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -74,7 +74,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -302,22 +302,22 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${jobTracker} - ${nameNode} - finalizedb.sh - ${stats_db_name} - ${stats_db_shadow_name} - finalizedb.sh - + + + + + + + + + + + + + @@ -355,67 +355,67 @@ stats_db_name=${stats_db_name} observatory_db_name=${observatory_db_name} - - - - - - - ${jobTracker} - ${nameNode} - observatory-post.sh - ${stats_db_name} - ${observatory_db_name} - ${observatory_db_shadow_name} - observatory-post.sh - - - - - - - - ${jobTracker} - ${nameNode} - copyDataToImpalaCluster.sh - ${external_stats_db_name} - ${stats_db_name} - ${monitor_db_name} - ${observatory_db_name} - copyDataToImpalaCluster.sh - - - - - - - - ${jobTracker} - ${nameNode} - finalizeImpalaCluster.sh - ${stats_db_name} - ${stats_db_shadow_name} - ${monitor_db_name} - ${monitor_db_shadow_name} - ${observatory_db_name} - ${observatory_db_shadow_name} - finalizeImpalaCluster.sh - - - - - - - - ${jobTracker} - ${nameNode} - updateCache.sh - ${stats_tool_api_url} - updateCache.sh - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +