1
0
Fork 0

code cleanup

This commit is contained in:
Antonis Lempesis 2024-03-22 10:16:49 +02:00
parent 459167ac2f
commit 4c40c96e30
1 changed files with 181 additions and 237 deletions

View File

@ -1,204 +1,154 @@
-- Sprint 1 ---- -- Sprint 1 ----
drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
--create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
--select distinct p.id, coalesce(green_oa, 0) as green_oa
--from ${stats_db_name}.publication p
-- left outer join (
-- select p.id, 1 as green_oa
-- from ${stats_db_name}.publication p
-- join ${stats_db_name}.result_instance ri on ri.id = p.id
-- join ${stats_db_name}.datasource on datasource.id = ri.hostedby
-- where datasource.type like '%Repository%'
-- and (ri.accessright = 'Open Access'
-- or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
-- on p.id= tmp.id;
create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select distinct p.id, coalesce(green_oa, 0) as green_oa
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as green_oa select p.id, 1 as green_oa
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
join ${stats_db_name}.result_instance ri on ri.id = p.id join ${stats_db_name}.result_instance ri on ri.id = p.id
join ${stats_db_name}.datasource on datasource.id = ri.hostedby join ${stats_db_name}.datasource on datasource.id = ri.hostedby
where datasource.type like '%Repository%' where datasource.type like '%Repository%' and (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and datasource.name!='Other') tmp on p.id= tmp.id; /*EOS*/
and (ri.accessright = 'Open Access'
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and datasource.name!='Other') tmp
on p.id= tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit select distinct p.id, coalesce(grey_lit, 0) as grey_lit
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as grey_lit select p.id, 1 as grey_lit
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
join ${stats_db_name}.result_classifications rt on rt.id = p.id join ${stats_db_name}.result_classifications rt on rt.id = p.id
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object')
not exists (select 1 from ${stats_db_name}.result_classifications rc where type ='Other literature type' and not exists (select 1 from ${stats_db_name}.result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; /*EOS*/
and rc.id=p.id)) tmp on p.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join left outer join (
(select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
join ${stats_db_name}.datasource d on d.id = ri.collectedfrom join ${stats_db_name}.datasource d on d.id = ri.collectedfrom
where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp on tmp.id=p.id; /*EOS*/
on tmp.id=p.id; /*EOS*/
-- Sprint 2 ---- -- Sprint 2 ----
drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select r.id, license.type as lic from ${stats_db_name}.result r left outer join (
select r.id, license.type as lic from ${stats_db_name}.result r
join ${stats_db_name}.result_licenses as license on license.id = r.id join ${stats_db_name}.result_licenses as license on license.id = r.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc %') tmp where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc %') tmp on r.id= tmp.id; /*EOS*/
on r.id= tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host left outer join (
select r.id, lower(parse_url(license.type, "HOST")) as lic_host
from ${stats_db_name}.result r from ${stats_db_name}.result r
join ${stats_db_name}.result_licenses as license on license.id = r.id join ${stats_db_name}.result_licenses as license on license.id = r.id
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; /*EOS*/
on r.id= tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
from ${stats_db_name}.publication; /*EOS*/ from ${stats_db_name}.publication; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join (select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp left outer join (
on r.id= tmp.id; /*EOS*/ select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
---- Sprint 3 ---- ---- Sprint 3 ----
drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
select distinct r.result as id, coalesce(fundref, 0) as fundref select distinct r.result as id, coalesce(fundref, 0) as fundref
from ${stats_db_name}.project_results r from ${stats_db_name}.project_results r
left outer join (select distinct result, 1 as fundref from ${stats_db_name}.project_results left outer join (
where provenance='Harvested') tmp select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
on r.result= tmp.result; /*EOS*/
-- create table indi_result_org_collab stored as parquet as
-- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations
-- from result_organization as o1
-- join result_organization as o2 on o1.id=o2.id and o1.organization!=o2.organization
-- group by o1.organization, o2.organization;
--
-- compute stats indi_result_org_collab;
--
create TEMPORARY VIEW tmp AS SELECT ro.organization organization, ro.id, o.name from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_org_collab purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_org_collab purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_org_collab stored as parquet as create table if not exists ${stats_db_name}.indi_result_org_collab stored as parquet as
select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations WITH tmp AS (
from tmp as o1 SELECT ro.organization organization, ro.id, o.name
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name from ${stats_db_name}.result_organization ro
group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/ join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
DROP VIEW if exists tmp; /*EOS*/ from tmp as o1
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
create TEMPORARY VIEW tmp AS group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
select distinct ro.organization organization, ro.id, o.name, o.country from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_org_country_collab purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_org_country_collab purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_org_country_collab stored as parquet as create table if not exists ${stats_db_name}.indi_result_org_country_collab stored as parquet as
select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations WITH tmp AS (
from tmp as o1 join tmp as o2 on o1.id=o2.id select distinct ro.organization organization, ro.id, o.name, o.country
where o1.id=o2.id and o1.country!=o2.country from ${stats_db_name}.result_organization ro
group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/ join ${stats_db_name}.organization o on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null)
DROP VIEW if exists tmp; /*EOS*/ select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
from tmp as o1 join tmp as o2 on o1.id=o2.id
create TEMPORARY VIEW tmp AS where o1.id=o2.id and o1.country!=o2.country
select o.id organization, o.name, ro.project as project from ${stats_db_name}.organization o group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_project_collab_org purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_project_collab_org purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_project_collab_org stored as parquet as create table if not exists ${stats_db_name}.indi_project_collab_org stored as parquet as
select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations WITH tmp AS (
from tmp as o1 select o.id organization, o.name, ro.project as project
from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.name<>o2.name where o1.organization<>o2.organization and o1.name<>o2.name
group by o1.name,o2.name, o1.organization, o2.organization; /*EOS*/ group by o1.name,o2.name, o1.organization, o2.organization; /*EOS*/
DROP VIEW if exists tmp; /*EOS*/
create TEMPORARY VIEW tmp AS
select o.id organization, o.name, o.country , ro.project as project from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id
and o.country <> 'UNKNOWN' and o.name is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_project_collab_org_country purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_project_collab_org_country purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_project_collab_org_country stored as parquet as create table if not exists ${stats_db_name}.indi_project_collab_org_country stored as parquet as
select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations WITH tmp AS (
from tmp as o1 select o.id organization, o.name, o.country , ro.project as project
from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country where o1.organization<>o2.organization and o1.country<>o2.country
group by o1.organization, o2.country, o1.name; /*EOS*/ group by o1.organization, o2.country, o1.name; /*EOS*/
DROP VIEW if exists tmp; /*EOS*/
drop table if exists ${stats_db_name}.indi_funder_country_collab purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_funder_country_collab purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funder_country_collab stored as parquet as create table if not exists ${stats_db_name}.indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from ${stats_db_name}.organization_projects op with tmp as (select funder, project, country from ${stats_db_name}.organization_projects op
join ${stats_db_name}.organization o on o.id=op.id join ${stats_db_name}.organization o on o.id=op.id
join ${stats_db_name}.project p on p.id=op.project join ${stats_db_name}.project p on p.id=op.project
where country <> 'UNKNOWN') where country <> 'UNKNOWN')
select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1 from tmp as f1
join tmp as f2 on f1.project=f2.project join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country where f1.country<>f2.country
group by f1.funder, f2.country, f1.country; /*EOS*/ group by f1.funder, f2.country, f1.country; /*EOS*/
create TEMPORARY VIEW tmp AS
select distinct country, ro.id as result from ${stats_db_name}.organization o
join ${stats_db_name}.result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_country_collab purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_country_collab purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_country_collab stored as parquet as create table if not exists ${stats_db_name}.indi_result_country_collab stored as parquet as
select o1.country country1, o2.country country2, count(o1.result) as collaborations WITH tmp AS (
from tmp as o1 select distinct country, ro.id as result from ${stats_db_name}.organization o
join ${stats_db_name}.result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null)
select o1.country country1, o2.country country2, count(o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country where o1.country<>o2.country
group by o1.country, o2.country; /*EOS*/ group by o1.country, o2.country; /*EOS*/
DROP VIEW if exists tmp; /*EOS*/
---- Sprint 4 ---- ---- Sprint 4 ----
drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from ${stats_db_name}.publication_datasources pd from ${stats_db_name}.publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as in_diamond_journal select pd.id, 1 as in_diamond_journal
from ${stats_db_name}.publication_datasources pd from ${stats_db_name}.publication_datasources pd
join ${stats_db_name}.datasource d on d.id=pd.datasource join ${stats_db_name}.datasource d on d.id=pd.datasource
@ -206,11 +156,10 @@ left outer join (
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; /*EOS*/ and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from ${stats_db_name}.publication pd from ${stats_db_name}.publication pd
left outer join ( left outer join (
select pd.id, 1 as is_transformative select pd.id, 1 as is_transformative
from ${stats_db_name}.publication_datasources pd from ${stats_db_name}.publication_datasources pd
join ${stats_db_name}.datasource d on d.id=pd.datasource join ${stats_db_name}.datasource d on d.id=pd.datasource
@ -218,11 +167,10 @@ left outer join (
and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; /*EOS*/ and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
from ${stats_db_name}.result_instance ri from ${stats_db_name}.result_instance ri
left outer join ( left outer join (
select ri.id, 1 as pub_closed_other_open select ri.id, 1 as pub_closed_other_open
from ${stats_db_name}.result_instance ri from ${stats_db_name}.result_instance ri
join ${stats_db_name}.publication p on p.id=ri.id join ${stats_db_name}.publication p on p.id=ri.id
@ -230,55 +178,52 @@ left outer join (
where d.type like '%Journal%' and ri.accessright='Closed Access' and where d.type like '%Journal%' and ri.accessright='Closed Access' and
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; /*EOS*/ (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; /*EOS*/
---- Sprint 5 ---- ---- Sprint 5 ----
drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from ${stats_db_name}.result_instance group by id; /*EOS*/ select id, count(id) as number_of_copies
from ${stats_db_name}.result_instance
group by id; /*EOS*/
---- Sprint 6 ---- ---- Sprint 6 ----
drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats SELECT result_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id join ${stats_db_name}.publication on result_id=id
where downloads>0 where downloads>0
GROUP BY result_id GROUP BY result_id; /*EOS*/
order by no_downloads desc; /*EOS*/
--ANALYZE TABLE ${stats_db_name}.indi_pub_downloads COMPUTE STATISTICS;
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats SELECT result_id, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id join ${stats_db_name}.publication on result_id=id
where downloads>0 where downloads>0
GROUP BY result_id, repository_id GROUP BY result_id, repository_id; /*EOS*/
order by result_id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id where downloads>0 join ${stats_db_name}.publication on result_id=id where downloads>0
GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/ GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
join ${stats_db_name}.publication on result_id=id from openaire_prod_usage_stats.usage_stats us
where downloads>0 join ${stats_db_name}.publication on result_id=id
GROUP BY result_id, repository_id, substring(us.`date`, 1,4); /*EOS*/ where downloads>0
GROUP BY result_id, repository_id, substring(us.`date`, 1,4); /*EOS*/
---- Sprint 7 ---- ---- Sprint 7 ----
drop table if exists ${stats_db_name}.indi_pub_gold_oa purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_gold_oa purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet as create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet as
with gold_oa as ( with gold_oa as (
select distinct issn from ( select distinct issn from (
SELECT issn_l as issn from stats_ext.issn_gold_oa_dataset_v5 SELECT issn_l as issn from stats_ext.issn_gold_oa_dataset_v5
UNION ALL UNION ALL
SELECT issn as issn from stats_ext.issn_gold_oa_dataset_v5 SELECT issn as issn from stats_ext.issn_gold_oa_dataset_v5
@ -286,8 +231,8 @@ select distinct issn from (
select issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true select issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true
UNION ALL UNION ALL
select issn_l as issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true) foo), select issn_l as issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true) foo),
dd as ( dd as (
select distinct * from ( select distinct * from (
select id, issn_printed as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%' select id, issn_printed as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%'
UNION ALL UNION ALL
select id, issn_online as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%' select id, issn_online as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%'
@ -295,17 +240,16 @@ select distinct * from (
select id, issn_printed as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_printed select id, issn_printed as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_printed
UNION ALL UNION ALL
select id, issn_online as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_online) foo select id, issn_online as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_online) foo
) )
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM ${stats_db_name}.publication_datasources pd FROM ${stats_db_name}.publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as is_gold select pd.id, 1 as is_gold
FROM ${stats_db_name}.publication_datasources pd FROM ${stats_db_name}.publication_datasources pd
join dd on dd.id=pd.datasource join dd on dd.id=pd.datasource
left outer join ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/ left outer join ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS ( WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
@ -326,9 +270,9 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
FROM ${stats_db_name}.datasource FROM ${stats_db_name}.datasource
WHERE issn_online IS NOT NULL ) as issn WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7) WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
FROM ${stats_db_name}.publication_datasources pd FROM ${stats_db_name}.publication_datasources pd
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
JOIN ${stats_db_name}.datasource d on d.id=pd.datasource JOIN ${stats_db_name}.datasource d on d.id=pd.datasource
JOIN issn on issn.id=pd.datasource JOIN issn on issn.id=pd.datasource
@ -337,7 +281,6 @@ LEFT OUTER JOIN (
JOIN ${stats_db_name}.indi_pub_gold_oa ga on pd.id=ga.id where cc.has_cc_license=1 and ga.is_gold=0) tmp on pd.id=tmp.id; /*EOS*/ JOIN ${stats_db_name}.indi_pub_gold_oa ga on pd.id=ga.id where cc.has_cc_license=1 and ga.is_gold=0) tmp on pd.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication pd select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication pd
left outer join ( left outer join (
@ -349,23 +292,24 @@ left outer join (
where indi_gold.is_gold=0 and ((d.type like '%Journal%' and ri.accessright!='Closed Access' and ri.accessright!='Restricted' and ri.license is not null) or ra.accessroute='hybrid')) tmp on pd.id=tmp.id; /*EOS*/ where indi_gold.is_gold=0 and ((d.type like '%Journal%' and ri.accessright!='Closed Access' and ri.accessright!='Restricted' and ri.license is not null) or ra.accessroute='hybrid')) tmp on pd.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as
--return results with PIDs, and rich metadata group by organization --return results with PIDs, and rich metadata group by organization
with result_fair as with result_fair as (
(select ro.organization organization, count(distinct ro.id) no_result_fair from ${stats_db_name}.result_organization ro select ro.organization organization, count(distinct ro.id) no_result_fair
from ${stats_db_name}.result_organization ro
join ${stats_db_name}.result r on r.id=ro.id join ${stats_db_name}.result r on r.id=ro.id
--join result_pids rp on r.id=rp.id --join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization), group by ro.organization),
--return all results group by organization --return all results group by organization
allresults as (select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro allresults as (
select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name}.result_organization ro
join ${stats_db_name}.result r on r.id=ro.id join ${stats_db_name}.result r on r.id=ro.id
where cast(year as int)>2003 where cast(year as int)>2003
group by ro.organization) group by ro.organization)
--return results_fair/all_results --return results_fair/all_results
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization; /*EOS*/ join result_fair on result_fair.organization=allresults.organization; /*EOS*/
CREATE TEMPORARY VIEW result_fair as CREATE TEMPORARY VIEW result_fair as