Changes to execute the stats workflow only in Hive

This commit is contained in:
dimitrispie 2023-01-04 11:39:01 +02:00
parent 2a4bf32d4c
commit dcb958e146
11 changed files with 472 additions and 554 deletions

View File

@ -31,8 +31,8 @@ hdfs dfs -copyFromLocal categories.csv ${TMP}
hdfs dfs -copyFromLocal concepts.csv ${TMP}
hdfs dfs -chmod -R 777 ${TMP}
export HADOOP_USER="antonis.lempesis"
export HADOOP_USER_NAME="antonis.lempesis"
export HADOOP_USER="dimitris.pierrakos"
export HADOOP_USER_NAME="dimitris.pierrakos"
echo "Creating and populating impala tables"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"

View File

@ -8,7 +8,9 @@ fi
export SOURCE=$1
export SHADOW=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Updating shadow database"
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
hive -f foo
hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
hive $HIVE_OPTS -f foo

View File

@ -8,8 +8,8 @@ fi
export TARGET=$1
export SCRIPT_PATH=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450"
export HADOOP_USER="antonis.lempesis"
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH

View File

@ -11,10 +11,15 @@ export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating monitor database"
cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
hive -f foo
echo "Impala shell finished"
#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -12,4 +12,4 @@ export SHADOW=$3
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive -f foo
echo "Impala shell finished"
echo "Hive shell finished"

View File

@ -29,6 +29,13 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
from rcount
group by rcount.pid;
-- Expose the stats_ext reference tables as pass-through views inside the stats database.
-- NOTE(review): the first view is named rndexpenditure but reads stats_ext.rndexpediture
-- (no second "n") - confirm the source table really carries that spelling.
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
create table ${stats_db_name}.result_instance stored as parquet as
select distinct r.*
from (
@ -44,3 +51,5 @@ from (
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null;
-- Pass-through view over stats_ext.issn_gold_oa_dataset, exposed inside the stats database.
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;

View File

@ -1,5 +1,5 @@
-- Sprint 1 ----
create table indi_pub_green_oa stored as parquet as
create table if not exists indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa
from publication p
left outer join (
@ -12,9 +12,9 @@ from publication p
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
on p.id= tmp.id;
compute stats indi_pub_green_oa;
ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS;
create table indi_pub_grey_lit stored as parquet as
create table if not exists indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit
from publication p
left outer join (
@ -25,9 +25,9 @@ from publication p
not exists (select 1 from result_classifications rc where type ='Other literature type'
and rc.id=p.id)) tmp on p.id=tmp.id;
compute stats indi_pub_grey_lit;
ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS;
create table indi_pub_doi_from_crossref stored as parquet as
create table if not exists indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from publication p
left outer join
@ -36,10 +36,10 @@ from publication p
where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp
on tmp.id=p.id;
compute stats indi_pub_doi_from_crossref;
ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS;
-- Sprint 2 ----
create table indi_result_has_cc_licence stored as parquet as
create table if not exists indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from result r
left outer join (select r.id, license.type as lic from result r
@ -47,9 +47,9 @@ from result r
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on r.id= tmp.id;
compute stats indi_result_has_cc_licence;
ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS;
create table indi_result_has_cc_licence_url stored as parquet as
create table if not exists indi_result_has_cc_licence_url stored as parquet as
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from result r
left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host
@ -58,31 +58,31 @@ from result r
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on r.id= tmp.id;
compute stats indi_result_has_cc_licence_url;
ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS;
-- indi_pub_has_abstract: distinct (publication id, has_abstract) pairs, with the
-- boolean abstract column cast to an int flag.
-- NOTE(review): a NULL abstract is coalesced to true and therefore yields 1 - confirm
-- that missing values are really meant to count as having an abstract.
create table if not exists indi_pub_has_abstract stored as parquet as
select distinct
    publication.id,
    cast(coalesce(abstract, true) as int) as has_abstract
from publication;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS;
-- indi_result_with_orcid: flags every result with has_orcid = 1 when its id appears
-- in result_orcid, and 0 otherwise (left join + coalesce).
create table if not exists indi_result_with_orcid stored as parquet as
select distinct
    r.id,
    coalesce(has_orcid, 0) as has_orcid
from result r
left outer join (
    select id, 1 as has_orcid
    from result_orcid
) tmp
    on r.id = tmp.id;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS;
---- Sprint 3 ----
-- indi_funded_result_with_fundref: for every result listed in project_results,
-- fundref = 1 when at least one of its rows has provenance 'Harvested', else 0.
create table if not exists indi_funded_result_with_fundref stored as parquet as
select distinct
    r.result as id,
    coalesce(fundref, 0) as fundref
from project_results r
left outer join (
    select distinct result, 1 as fundref
    from project_results
    where provenance='Harvested'
) tmp
    on r.result = tmp.result;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS;
-- create table indi_result_org_collab stored as parquet as
-- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations
@ -92,77 +92,59 @@ compute stats indi_funded_result_with_fundref;
--
-- compute stats indi_result_org_collab;
--
-- indi_result_org_collab: number of results shared by each ordered pair of
-- distinct organizations.
--
-- Step 1: session-scoped helper table with one row per (organization, result) for
-- organizations that have a name. DISTINCT matters here: the final query uses a
-- plain count(o1.id), so duplicate rows in result_organization would otherwise be
-- counted as extra collaborations.
create TEMPORARY TABLE tmp AS
select distinct
    ro.organization organization,
    ro.id
from result_organization ro
join organization o on o.id=ro.organization
where o.name is not null;

-- Step 2: self-join the helper on the result id. The equality lives in ON so the
-- join is never planned as a cartesian product. The inequality drops self-pairs.
create table if not exists indi_result_org_collab stored as parquet as
select
    o1.organization org1,
    o2.organization org2,
    count(o1.id) as collaborations
from tmp as o1
join tmp as o2
    on o1.id = o2.id
where o1.organization != o2.organization
group by o1.organization, o2.organization;

-- The helper name tmp is reused by later statements, so release it right away.
drop table tmp purge;
-- create table indi_result_org_country_collab stored as parquet as
-- with tmp as
-- (select o.id as id, o.country , ro.id as result,r.type from organization o
-- join result_organization ro on o.id=ro.organization
-- join result r on r.id=ro.id where o.country <> 'UNKNOWN')
-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id and o1.country<>o2.country
-- group by o1.id, o1.type,o2.country;
--
-- compute stats indi_result_org_country_collab;
--
-- Statistics for the table created by the preceding statement (Hive syntax).
ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS;

-- indi_result_org_country_collab: collaborations between an organization and
-- organizations located in a different country, matched on shared results.
--
-- Step 1: helper with distinct (organization, result, country) rows, restricted to
-- named organizations whose country is not 'UNKNOWN'.
create TEMPORARY TABLE tmp AS
select distinct
    ro.organization organization,
    ro.id,
    o.country
from result_organization ro
join organization o on o.id=ro.organization
where o.country <> 'UNKNOWN' and o.name is not null;

-- Step 2: self-join on the result id and keep pairs from different countries.
-- NOTE(review): the grouping includes o1.id (the result id), so the table holds one
-- row per (org1, result, country2) rather than one per (org1, country2) - confirm
-- this granularity is what consumers expect.
create table if not exists indi_result_org_country_collab stored as parquet as
select
    o1.organization org1,
    o2.country country2,
    count(o1.id) as collaborations
from tmp as o1
join tmp as o2
    on o1.id = o2.id
where o1.country != o2.country
group by o1.organization, o1.id, o2.country;

-- The helper name tmp is reused by later statements, so release it right away.
drop table tmp purge;
-- create table indi_result_org_collab stored as parquet as
-- with tmp as
-- (select o.id, ro.id as result,r.type from organization o
-- join result_organization ro on o.id=ro.organization
-- join result r on r.id=ro.id)
-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id
-- group by o1.id, o2.id, o1.type;
--
-- compute stats indi_result_org_collab;
--
-- Statistics for the table created by the preceding statement (Hive syntax).
ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS;

-- indi_project_collab_org: number of distinct projects shared by each ordered pair
-- of different organizations, from a self-join of organization_projects.
create table if not exists indi_project_collab_org stored as parquet as
select
    o1.id org1,
    o2.id org2,
    count(distinct o1.project) as collaborations
from organization_projects as o1
join organization_projects as o2
    on o1.project = o2.project
where o1.id != o2.id
group by o1.id, o2.id;

ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS;
-- indi_project_collab_org_country: number of distinct projects an organization
-- shares with organizations from another country.
--
-- Step 1: helper with (organization, country, project) rows for organizations whose
-- country is not 'UNKNOWN'.
create TEMPORARY TABLE tmp AS
select
    o.id organization,
    o.country,
    ro.project as project
from organization o
join organization_projects ro
    on o.id = ro.id
    and o.country <> 'UNKNOWN';

-- Step 2: self-join on the project and keep pairs of different organizations that
-- sit in different countries.
create table if not exists indi_project_collab_org_country stored as parquet as
select
    o1.organization org1,
    o2.country country2,
    count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2
    on o1.project = o2.project
where o1.organization <> o2.organization and o1.country <> o2.country
group by o1.organization, o2.country;

-- The helper name tmp is reused by later statements, so release it right away.
drop table tmp purge;
create table indi_funder_country_collab stored as parquet as
ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS;
create table if not exists indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id
join project p on p.id=op.project
@ -173,36 +155,26 @@ from tmp as f1
where f1.country<>f2.country
group by f1.funder, f2.country, f1.country;
compute stats indi_funder_country_collab;
--
-- create table indi_result_country_collab stored as parquet as
-- with tmp as
-- (select country, ro.id as result,r.type from organization o
-- join result_organization ro on o.id=ro.organization
-- join result r on r.id=ro.id where country <> 'UNKNOWN')
-- select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.country<>o2.country
-- group by o1.country, o2.country, o1.type;
--
-- compute stats indi_result_country_collab;
ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS;
-- indi_result_country_collab: number of results shared by each ordered pair of
-- different countries.
--
-- Step 1: helper with distinct (country, result) rows for named organizations whose
-- country is not 'UNKNOWN'.
create TEMPORARY TABLE tmp AS
select distinct
    country,
    ro.id as result
from organization o
join result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null;

-- Step 2: self-join on the result and count per country pair.
create table if not exists indi_result_country_collab stored as parquet as
select
    o1.country country1,
    o2.country country2,
    count(o1.result) as collaborations
from tmp as o1
join tmp as o2
    on o1.result = o2.result
where o1.country <> o2.country
group by o1.country, o2.country;

-- The helper name tmp is reused by later statements, so release it right away.
drop table tmp purge;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS;
---- Sprint 4 ----
create table indi_pub_diamond stored as parquet as
create table if not exists indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from publication_datasources pd
left outer join (
@ -212,21 +184,9 @@ from publication_datasources pd
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
on pd.id=tmp.id;
compute stats indi_pub_diamond;
ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS;
--create table indi_pub_hybrid stored as parquet as
--select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
--from publication_datasources pd
-- left outer join (
-- select pd.id, 1 as is_hybrid from publication_datasources pd
-- join datasource d on d.id=pd.datasource
-- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
-- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
-- on pd.id=tmp.id;
--
--compute stats indi_pub_hybrid;
create table indi_pub_in_transformative stored as parquet as
create table if not exists indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from publication pd
left outer join (
@ -236,9 +196,9 @@ from publication pd
and ps.is_transformative_journal=true) tmp
on pd.id=tmp.id;
compute stats indi_pub_in_transformative;
ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS;
create table indi_pub_closed_other_open stored as parquet as
create table if not exists indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join
(select ri.id, 1 as pub_closed_other_open from result_instance ri
@ -248,180 +208,16 @@ select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_op
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id;
compute stats indi_pub_closed_other_open;
ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS;
---- Sprint 5 ----
-- indi_result_no_of_copies: number of result_instance rows per result id.
create table if not exists indi_result_no_of_copies stored as parquet as
select
    id,
    count(id) as number_of_copies
from result_instance
group by id;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ----
--create table indi_pub_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION ALL
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT * FROM
-- (SELECT id, issn_printed as issn
-- FROM datasource WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource WHERE issn_online IS NOT NULL) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM publication_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM publication_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--compute stats indi_pub_gold_oa;
--
--create table indi_datasets_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION
-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT *
-- FROM (
-- SELECT id,issn_printed as issn
-- FROM datasource
-- WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource
-- WHERE issn_online IS NOT NULL ) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM dataset_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
--compute stats indi_datasets_gold_oa;
--create table indi_software_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION
-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT *
-- FROM (
-- SELECT id,issn_printed as issn
-- FROM datasource
-- WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource
-- WHERE issn_online IS NOT NULL ) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM software_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM software_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
--compute stats indi_software_gold_oa;
--create table indi_org_findable stored as parquet as
--with result_with_pid as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
-- join result_pids rp on rp.id=ro.id
-- group by ro.organization),
--result_has_abstract as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro
-- join result rp on rp.id=ro.id where rp.abstract=true
-- group by ro.organization),
--allresults as (
-- select organization, count(distinct id) no_allresults from result_organization
-- group by organization),
--result_with_pid_share as (
-- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share
-- from allresults
-- join result_with_pid on result_with_pid.organization=allresults.organization),
--result_with_abstract_share as (
-- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share
-- from allresults
-- join result_has_abstract on result_has_abstract.organization=allresults.organization)
--select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable
--from allresults
--join result_with_pid_share on result_with_pid_share.organization=allresults.organization
--left outer join (
-- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
--
--compute stats indi_org_findable;
--
--create table indi_org_openess stored as parquet as
--WITH datasets_oa as (
-- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join dataset ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--software_oa as (
-- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join software ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--pubs_oa as (
-- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join publication ds on dg.id=ds.id
-- where dg.is_gold=1
-- group by ro.organization),
--allpubs as (
-- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
-- join publication ps on ps.id=ro.id
-- group by ro.organization),
--alldatasets as (
-- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
-- join dataset ps on ps.id=ro.id
-- group by ro.organization),
--allsoftware as (
-- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
-- join software ps on ps.id=ro.id
-- group by ro.organization),
--allpubsshare as (
-- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
-- join pubs_oa on allpubs.organization=pubs_oa.organization),
--alldatasetssshare as (
-- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c
-- from alldatasets
-- join datasets_oa on alldatasets.organization=datasets_oa.organization),
--allsoftwaresshare as (
-- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
-- from allsoftware
-- join software_oa on allsoftware.organization=software_oa.organization)
--select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess
--FROM allpubsshare
--left outer join (
-- select organization,c from
-- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization
--left outer join (
-- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
--
--compute stats indi_org_openess;
--
create table indi_pub_hybrid_oa_with_cc stored as parquet as
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
FROM stats_ext.plan_s_jn
@ -436,7 +232,7 @@ create table indi_pub_hybrid_oa_with_cc stored as parquet as
SELECT id, issn_printed as issn
FROM datasource
WHERE issn_printed IS NOT NULL
UNION
UNION ALL
SELECT id,issn_online as issn
FROM datasource
WHERE issn_online IS NOT NULL ) as issn
@ -451,45 +247,44 @@ FROM publication_datasources pd
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
where cc.has_cc_license=1) tmp on pd.id=tmp.id;
compute stats indi_pub_hybrid_oa_with_cc;
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
-- indi_pub_downloads: total downloads per publication, summed over usage-stats rows
-- with downloads > 0.
-- No ORDER BY: row order inside a stored table is not something readers can rely on,
-- a global sort in a CTAS funnels the whole result through one ordering stage, and
-- Hive strict checks can reject ORDER BY without LIMIT. Consumers that need a
-- ranking should sort at query time.
create table if not exists indi_pub_downloads stored as parquet as
select
    result_id,
    sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join publication on result_id=id
where downloads>0
group by result_id;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS;
-- indi_pub_downloads_datasource: total downloads per (publication, repository),
-- summed over usage-stats rows with downloads > 0.
-- No ORDER BY: stored row order carries no meaning for readers, and a global sort in
-- a CTAS is costly and can be rejected by Hive strict checks. Sort at query time.
create table if not exists indi_pub_downloads_datasource stored as parquet as
select
    result_id,
    repository_id,
    sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join publication on result_id=id
where downloads>0
group by result_id, repository_id;

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS;
-- indi_pub_downloads_year: total downloads per (publication, year), where year is
-- the first four characters of the usage-stats date column.
-- The grouping repeats the substring expression instead of the select alias,
-- because the alias is not reliably resolvable in a Hive GROUP BY.
create table if not exists indi_pub_downloads_year stored as parquet as
select
    result_id,
    substring(us.`date`, 1,4) as `year`,
    sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join publication on result_id=id
where downloads>0
group by result_id, substring(us.`date`, 1,4);

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS;
-- indi_pub_downloads_datasource_year: total downloads per (publication, repository,
-- year), where year is the first four characters of the usage-stats date column.
-- The grouping repeats the substring expression instead of the select alias,
-- because the alias is not reliably resolvable in a Hive GROUP BY.
create table if not exists indi_pub_downloads_datasource_year stored as parquet as
select
    result_id,
    substring(us.`date`, 1,4) as `year`,
    repository_id,
    sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join publication on result_id=id
where downloads>0
group by result_id, repository_id, substring(us.`date`, 1,4);

-- Hive statistics syntax (replaces the Impala-only compute stats statement).
ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS;
---- Sprint 7 ----
create table indi_pub_gold_oa stored as parquet as
create table if not exists indi_pub_gold_oa stored as parquet as
WITH gold_oa AS ( SELECT
issn_l,
journal_is_in_doaj,
@ -518,7 +313,7 @@ create table indi_pub_gold_oa stored as parquet as
datasource
WHERE
issn_printed IS NOT NULL
UNION
UNION ALL
SELECT
id,
issn_online as issn
@ -538,9 +333,9 @@ FROM
JOIN gold_oa on issn.issn = gold_oa.issn) tmp
on pd.id=tmp.id;
compute stats indi_pub_gold_oa;
ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS;
create table indi_pub_hybrid stored as parquet as
create table if not exists indi_pub_hybrid stored as parquet as
WITH gold_oa AS ( SELECT
issn_l,
journal_is_in_doaj,
@ -571,7 +366,7 @@ create table indi_pub_hybrid stored as parquet as
datasource
WHERE
issn_printed IS NOT NULL
UNION
UNION ALL
SELECT
id,
issn_online as issn
@ -591,15 +386,15 @@ from publication_datasources pd
where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp
on pd.id=tmp.id;
compute stats indi_pub_hybrid;
ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS;
create table indi_org_fairness stored as parquet as
create table if not exists indi_org_fairness stored as parquet as
--return results with PIDs, and rich metadata group by organization
with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join result r on r.id=ro.id
--join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003
where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization),
--return all results group by organization
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
@ -611,16 +406,16 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu
from allresults
join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness;
ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS;
create table indi_org_fairness_pub_pr stored as parquet as
create table if not exists indi_org_fairness_pub_pr stored as parquet as
with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair
from result_organization ro
join publication p on p.id=ro.id
join indi_pub_doi_from_crossref dc on dc.id=p.id
join indi_pub_grey_lit gl on gl.id=p.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null)
where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null)
and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0
group by ro.organization),
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
@ -632,150 +427,180 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu
from allresults
join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness_pub_pr;
ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS;
-- indi_org_fairness_pub_year: per organization and year (after 2003), the share of
-- results that carry a title, publisher, abstract, year and at least one author.
-- NOTE(review): the table name says "pub" but both helpers join the generic result
-- table, not publication - confirm whether a publication-only population is meant.
--
-- Numerator: results per (organization, year) that satisfy the metadata criteria.
CREATE TEMPORARY TABLE result_fair as
select
    year,
    ro.organization organization,
    count(distinct ro.id) no_result_fair
from result_organization ro
join result p on p.id=ro.id
where (title is not null) and (publisher is not null) and (abstract=true)
  and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization, year;

-- Denominator: all results per (organization, year) after 2003.
CREATE TEMPORARY TABLE allresults as
select
    year,
    organization,
    count(distinct ro.id) no_allresults
from result_organization ro
join result p on p.id=ro.id
where cast(year as int)>2003
group by organization, year;

-- Ratio numerator/denominator. The inner join leaves out (organization, year) pairs
-- that have no qualifying result.
create table if not exists indi_org_fairness_pub_year stored as parquet as
select
    allresults.year,
    allresults.organization,
    result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair
    on result_fair.organization = allresults.organization
    and result_fair.year = allresults.year;

-- Both helper names are reused by the following indicators, so release them now.
DROP table result_fair purge;
DROP table allresults purge;
-- Statistics for the table created by the preceding statement (Hive syntax).
ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS;

-- indi_org_fairness_pub: per organization, the share of results after 2003 that
-- carry a title, publisher, abstract, year and at least one author.
-- NOTE(review): the table name says "pub" but both helpers join the generic result
-- table, not publication - confirm whether a publication-only population is meant.
--
-- Numerator: results per organization that satisfy the metadata criteria.
CREATE TEMPORARY TABLE result_fair as
select
    ro.organization organization,
    count(distinct ro.id) no_result_fair
from result_organization ro
join result p on p.id=ro.id
where (title is not null) and (publisher is not null) and (abstract=true)
  and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization;

-- Denominator: all results per organization after 2003.
CREATE TEMPORARY TABLE allresults as
select
    organization,
    count(distinct ro.id) no_allresults
from result_organization ro
join result p on p.id=ro.id
where cast(year as int)>2003
group by organization;

-- Stored as parquet like every other indicator table in this script.
create table if not exists indi_org_fairness_pub stored as parquet as
select
    allresults.organization,
    result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair
    on result_fair.organization = allresults.organization;

-- Both helper names are reused by the following indicators, so release them now.
DROP table result_fair purge;
DROP table allresults purge;
create table indi_org_fairness_year stored as parquet as
with result_fair as
(select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS;
-- Numerator: distinct results per organization and year that have a row in result_pids and also
-- a title, a publisher, an abstract, a year after 2003 and at least one author.
CREATE TEMPORARY TABLE result_fair as
select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join result r on r.id=ro.id
join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization, year;
-- Denominator: all distinct results per organization and year (year after 2003).
CREATE TEMPORARY TABLE allresults as
select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where cast(year as int)>2003
group by organization, year;
-- Returns results_fair/all_results per organization and year.
create table if not exists indi_org_fairness_year stored as parquet as
select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year;
-- The temporary tables are dropped explicitly because the following indicators reuse the same names.
DROP table result_fair purge;
DROP table allresults purge;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS;
-- Results with PIDs, grouped by organization and year (year after 2003).
CREATE TEMPORARY TABLE result_with_pid as
select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id
join result r on r.id=rp.id
where cast(year as int) >2003
group by ro.organization, year;
-- All results, grouped by organization and year (year after 2003).
CREATE TEMPORARY TABLE allresults as
select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where cast(year as int) >2003
group by organization, year;
-- Returns results_with_pid/all_results per organization and year.
create table if not exists indi_org_findable_year stored as parquet as
select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year;
-- The temporary tables are dropped explicitly because the following indicators reuse the same names.
DROP table result_with_pid purge;
DROP table allresults purge;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS;
-- Results with PIDs, grouped by organization (year after 2003).
CREATE TEMPORARY TABLE result_with_pid as
select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id
join result r on r.id=rp.id
where cast(year as int) >2003
group by ro.organization;
-- All results, grouped by organization (year after 2003).
CREATE TEMPORARY TABLE allresults as
select organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where cast(year as int) >2003
group by organization;
-- Returns results_with_pid/all_results per organization.
create table if not exists indi_org_findable stored as parquet as
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization;
-- The temporary tables are dropped explicitly because the following indicators reuse the same names.
DROP table result_with_pid purge;
DROP table allresults purge;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE indi_org_findable COMPUTE STATISTICS;
-- Temporary inputs of indi_org_openess. Each former CTE is materialised as its own temporary table
-- so that the whole chain runs as plain Hive statements.
-- Open publications per organization (access right Open Access, Embargo or Open Source, year after 2003).
CREATE TEMPORARY TABLE pubs_oa as
SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization;
-- Open datasets per organization, same access right and year filter.
CREATE TEMPORARY TABLE datasets_oa as
SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization;
-- Open software per organization, same access right and year filter.
CREATE TEMPORARY TABLE software_oa as
SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization;
-- Totals per organization (year after 2003).
-- NOTE(review): the totals use count(ro.id) while the open counts use count(distinct r.id) - confirm
-- that result_organization has no duplicate (id, organization) rows, otherwise the shares are skewed.
CREATE TEMPORARY TABLE allpubs as
SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join publication ps on ps.id=ro.id
where cast(ps.year as int)>2003
group by ro.organization;
CREATE TEMPORARY TABLE alldatasets as
SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join dataset ps on ps.id=ro.id
where cast(ps.year as int)>2003
group by ro.organization;
CREATE TEMPORARY TABLE allsoftware as
SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join software ps on ps.id=ro.id
where cast(ps.year as int)>2003
group by ro.organization;
-- Open share per result type: p for publications, d for datasets, s for software.
CREATE TEMPORARY TABLE allpubsshare as
select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization;
CREATE TEMPORARY TABLE alldatasetssshare as
select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d
from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization;
CREATE TEMPORARY TABLE allsoftwaresshare as
select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware
join software_oa on allsoftware.organization=software_oa.organization;
create table if not exists indi_org_openess stored as parquet as
select allpubsshare.organization,
(p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end)
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
left outer join (select organization,d from
@ -785,55 +610,75 @@ select allpubsshare.organization,
allsoftwaresshare) tmp2
on tmp2.organization=allpubsshare.organization;
-- Drop the temporary inputs of indi_org_openess. The yearly variant below recreates the same names.
DROP TABLE pubs_oa purge;
DROP TABLE datasets_oa purge;
DROP TABLE software_oa purge;
DROP TABLE allpubs purge;
DROP TABLE alldatasets purge;
DROP TABLE allsoftware purge;
DROP TABLE allpubsshare purge;
DROP TABLE alldatasetssshare purge;
DROP TABLE allsoftwaresshare purge;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE indi_org_openess COMPUTE STATISTICS;
-- Temporary inputs of indi_org_openess_year. Same chain as for indi_org_openess, additionally grouped by year.
-- Open publications per organization and year (access right Open Access, Embargo or Open Source, year after 2003).
CREATE TEMPORARY TABLE pubs_oa AS
SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization,r.year;
-- Open datasets per organization and year.
CREATE TEMPORARY TABLE datasets_oa AS
SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization, r.year;
-- Open software per organization and year.
CREATE TEMPORARY TABLE software_oa AS
SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization, r.year;
-- Totals per organization and year (year after 2003).
-- NOTE(review): the totals use count(ro.id) while the open counts use count(distinct r.id) - confirm
-- that result_organization has no duplicate (id, organization) rows, otherwise the shares are skewed.
CREATE TEMPORARY TABLE allpubs as
SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join publication p on p.id=ro.id where cast(p.year as int)>2003
group by ro.organization, p.year;
CREATE TEMPORARY TABLE alldatasets as
SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join dataset d on d.id=ro.id where cast(d.year as int)>2003
group by ro.organization, d.year;
CREATE TEMPORARY TABLE allsoftware as
SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join software s on s.id=ro.id where cast(s.year as int)>2003
group by ro.organization, s.year;
-- Open share per result type and year: p for publications, d for datasets, s for software.
-- The year match is applied in the where clause, which is equivalent here because the joins are inner joins.
CREATE TEMPORARY TABLE allpubsshare as
select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int);
CREATE TEMPORARY TABLE alldatasetssshare as
select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d
from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int);
CREATE TEMPORARY TABLE allsoftwaresshare as
select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware
join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int);
create table if not exists indi_org_openess_year stored as parquet as
select allpubsshare.year, allpubsshare.organization,
(p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end)
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
left outer join (select year, organization,d from
@ -843,9 +688,19 @@ select allpubsshare.year, allpubsshare.organization,
allsoftwaresshare) tmp2
on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year;
-- Drop the temporary inputs of indi_org_openess_year.
DROP TABLE pubs_oa purge;
DROP TABLE datasets_oa purge;
DROP TABLE software_oa purge;
DROP TABLE allpubs purge;
DROP TABLE alldatasets purge;
DROP TABLE allsoftware purge;
DROP TABLE allpubsshare purge;
DROP TABLE alldatasetssshare purge;
DROP TABLE allsoftwaresshare purge;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS;
create table if not exists indi_pub_has_preprint stored as parquet as
select distinct p.id, coalesce(has_preprint, 0) as has_preprint
from publication_classifications p
left outer join (
@ -854,9 +709,9 @@ from publication_classifications p
where p.type='Preprint') tmp
on p.id= tmp.id;
compute stats indi_pub_has_preprint;
ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS;
create table indi_pub_in_subscribed stored as parquet as
create table if not exists indi_pub_in_subscribed stored as parquet as
select distinct p.id, coalesce(is_subscription, 0) as is_subscription
from publication p
left outer join(
@ -867,9 +722,9 @@ from publication p
where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp
on p.id=tmp.id;
compute stats indi_pub_in_subscribed;
ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS;
create table indi_result_with_pid as
create table if not exists indi_result_with_pid as
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from result p
left outer join (
@ -877,4 +732,4 @@ from result p
from result_pids p) tmp
on p.id= tmp.id;
compute stats indi_result_with_pid;
ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;

View File

@ -10,6 +10,11 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_
-- Reference tables exposed unchanged in the target database as views over the source database.
-- Every view is created only when it does not exist yet.
create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
-- NOTE(review): the source name is spelled rndexpediture while the view is rndexpenditure - presumably
-- the source table really carries that spelling, confirm before changing either side.
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
create table TARGET.result stored as parquet as
select distinct * from (
@ -54,84 +59,87 @@ create table TARGET.result stored as parquet as
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork
) )) foo;
-- Statistics for the result table created just above. Only the Hive form is kept because the
-- Impala-only compute stats statement fails when the script runs in Hive.
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

-- Filtered copies of the per-result tables: a row is kept only when its id exists in the
-- result table of the target database. Statistics are computed right after each copy.
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
-- licenses_normalized is copied in full, without the result filter.
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
-- result_result keeps every relation whose source or whose target is a result of the target database.
-- The two helper views select each side, and the distinct over their union removes relations matched by both.
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
-- The helper views are only needed while the table is built.
drop view TARGET.foo1;
drop view TARGET.foo2;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
-- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
@ -140,7 +148,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
-- Inverse of result_datasources: the datasource becomes the id column and the result id is exposed as result.
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
-- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization;
@ -157,28 +165,28 @@ create view if not exists TARGET.project_resultcount as select * from SOURCE.pro
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
-- Inverse of result_projects: the project becomes the id column and the result id is exposed as result.
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
-- Hive statistics, used instead of the Impala-only compute stats statement.
ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
-- indicators
-- Per-result indicator tables are copied with the same filter as the result_* tables: a row is kept
-- only when its id exists in the result table of the target database. Statistics use the Hive
-- ANALYZE TABLE form only, because the Impala compute stats statement fails in Hive.
-- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
-- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
-- The collaboration indicators are exposed unfiltered as views over the source database.
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
@ -187,30 +195,30 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
-- The indicator tables below keep only rows that belong to a result of the target database.
-- Statistics use the Hive ANALYZE TABLE form only, because the Impala compute stats statement fails in Hive.
---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
-- The download tables are matched on their result_id column instead of id.
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
-- The organization level indicators are exposed unfiltered as views over the source database.
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
@ -221,11 +229,12 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
-- Organization openness indicators are exposed unfiltered as views over the source database.
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
-- Per-result indicators are copied only for ids that exist in the result table of the target
-- database, and Hive statistics are computed right after each copy.
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
-- Disabled indicators, kept for reference. The statistics lines use the Hive form so that they work if re-enabled.
--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_datasets_gold_oa COMPUTE STATISTICS;
--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_software_gold_oa COMPUTE STATISTICS;

View File

@ -8,6 +8,8 @@ from ${stats_db_name}.result r
group by rl.id
) rln on rln.id=r.id;
ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_country stored as parquet as
select
count(distinct r.id) as total,
@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_year stored as parquet as
select
count(distinct r.id) as total,
@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
select
count(distinct r.id) as total,
@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
select
count(distinct r.id) as total,
@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select
count(distinct r.id) as total,
@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select
count(distinct r.id) as total,
@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select
count(distinct r.id) as total,
@ -215,6 +229,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select
count(distinct r.id) as total,
@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select
count(distinct r.id) as total,
@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_country stored as parquet as
select
count(distinct r.id) as total,
@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_year stored as parquet as
select
count(distinct r.id) as total,
@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
select
count(distinct r.id) as total,
@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select
count(distinct r.id) as total,
@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select
count(distinct r.id) as total,
@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select
count(distinct r.id) as total,
@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select
count(distinct r.id) as total,
@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select
count(distinct r.id) as total,
@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select
count(distinct r.id) as total,
@ -559,3 +595,5 @@ from ${stats_db_name}.result r
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS;

View File

@ -96,6 +96,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
-- View over result_datasources with the two columns swapped in role:
-- the datasource column is exposed as id and the id column as result.
-- A view holds no data of its own, so it takes no STORED AS clause.
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources;

View File

@ -74,7 +74,7 @@
</configuration>
</global>
<!-- Workflow entry point. Oozie accepts exactly one start node per workflow.
     NOTE(review): the run begins at Step1, so an action named Contexts is not
     the entry point. Confirm this is the intended first action. -->
<start to="Step1"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -302,23 +302,23 @@
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step19-finalize"/>
<error to="Kill"/>
</action>
<!-- Step19-finalize: shell action that runs finalizedb.sh with two arguments, the
     stats_db_name and stats_db_shadow_name workflow properties (in that order).
     The file element ships the script with the action.
     On success the workflow continues to step20-createMonitorDB, on error to Kill. -->
<action name="Step19-finalize">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizedb.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<file>finalizedb.sh</file>
</shell>
<ok to="step20-createMonitorDB"/>
<error to="Kill"/>
</action>
<!-- <action name="Step19-finalize">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>finalizedb.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${stats_db_shadow_name}</argument>-->
<!-- <file>finalizedb.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step20-createMonitorDB"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="step20-createMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
@ -355,67 +355,67 @@
<param>stats_db_name=${stats_db_name}</param>
<param>observatory_db_name=${observatory_db_name}</param>
</hive2>
<ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/>
</action>
<!-- step21-createObservatoryDB-post: shell action that runs observatory-post.sh with
     three arguments, the stats_db_name, observatory_db_name and
     observatory_db_shadow_name workflow properties (in that order).
     On success the workflow continues to step22-copyDataToImpalaCluster, on error to Kill. -->
<action name="step21-createObservatoryDB-post">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>observatory-post.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<file>observatory-post.sh</file>
</shell>
<ok to="step22-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<!-- step22-copyDataToImpalaCluster: shell action that runs copyDataToImpalaCluster.sh
     with four arguments, the external_stats_db_name, stats_db_name, monitor_db_name
     and observatory_db_name workflow properties (in that order).
     On success the workflow continues to step23-finalizeImpalaCluster, on error to Kill. -->
<action name="step22-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${external_stats_db_name}</argument>
<argument>${stats_db_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${observatory_db_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="step23-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<!-- step23-finalizeImpalaCluster: shell action that runs finalizeImpalaCluster.sh with
     six arguments, each database name followed by its shadow name: stats, monitor and
     observatory (in that order).
     On success the workflow continues to Step24-updateCache, on error to Kill. -->
<action name="step23-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="Step24-updateCache"/>
<error to="Kill"/>
</action>
<!-- Step24-updateCache: shell action that runs updateCache.sh with a single argument,
     the stats_tool_api_url workflow property.
     On success the workflow reaches the End node, on error it goes to Kill. -->
<action name="Step24-updateCache">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>updateCache.sh</exec>
<argument>${stats_tool_api_url}</argument>
<file>updateCache.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<!-- <action name="step21-createObservatoryDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>observatory-post.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <argument>${observatory_db_shadow_name}</argument>-->
<!-- <file>observatory-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step22-copyDataToImpalaCluster"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="step22-copyDataToImpalaCluster">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>copyDataToImpalaCluster.sh</exec>-->
<!-- <argument>${external_stats_db_name}</argument>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <file>copyDataToImpalaCluster.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step23-finalizeImpalaCluster"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="step23-finalizeImpalaCluster">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>finalizeImpalaCluster.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${stats_db_shadow_name}</argument>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <argument>${observatory_db_shadow_name}</argument>-->
<!-- <file>finalizeImpalaCluster.sh</file>-->
<!-- </shell>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="Step24-updateCache">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>updateCache.sh</exec>-->
<!-- <argument>${stats_tool_api_url}</argument>-->
<!-- <file>updateCache.sh</file>-->
<!-- </shell>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- Terminal node of the workflow: actions transition here via ok to="End". -->
<end name="End"/>
</workflow-app>