[stats wf] final touches for the integration of PRs #166, #179 in the master branch

This commit is contained in:
Claudio Atzori 2022-01-12 12:04:31 +01:00
parent b053b0178e
commit 9acc32faa6
2 changed files with 29 additions and 224 deletions

View File

@ -42,181 +42,8 @@ join datasource on datasource.id = ri.hostedby
where datasource.id like '%doajarticles%') tmp where datasource.id like '%doajarticles%') tmp
on p.id= tmp.id; on p.id= tmp.id;
--create table indi_project_pubs_count stored as parquet as
--select pr.id id, count(p.id) total_pubs from project_results pr
--join publication p on p.id=pr.result
--group by pr.id;
--create table indi_project_datasets_count stored as parquet as
--select pr.id id, count(d.id) total_datasets from project_results pr
--join dataset d on d.id=pr.result
--group by pr.id;
--create table indi_project_software_count stored as parquet as
--select pr.id id, count(s.id) total_software from project_results pr
--join software s on s.id=pr.result
--group by pr.id;
--create table indi_project_otherresearch_count stored as parquet as
--select pr.id id, count(o.id) total_other from project_results pr
--join otherresearchproduct o on o.id=pr.result
--group by pr.id;
--create table indi_pub_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--from
--(SELECT year, country, SUM(CASE
--WHEN bestlicence='Open Access' THEN 1
--ELSE 0
--END) AS OpenAccess, SUM(CASE
--WHEN bestlicence<>'Open Access' THEN 1
--ELSE 0
--END) AS NonOpenAccess
--FROM publication p
--join result_organization ro on p.id=ro.id
--join organization o on o.id=ro.organization
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by year, country) tmp;
--create table indi_dataset_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--from
--(SELECT year, country, SUM(CASE
--WHEN bestlicence='Open Access' THEN 1
--ELSE 0
--END) AS OpenAccess, SUM(CASE
--WHEN bestlicence<>'Open Access' THEN 1
--ELSE 0
--END) AS NonOpenAccess
--FROM dataset d
--join result_organization ro on d.id=ro.id
--join organization o on o.id=ro.organization
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by year, country) tmp;
--create table indi_software_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--from
-- (SELECT year, country, SUM(CASE
--WHEN bestlicence='Open Access' THEN 1
-- ELSE 0
--END) AS OpenAccess, SUM(CASE
-- WHEN bestlicence<>'Open Access' THEN 1
-- ELSE 0
-- END) AS NonOpenAccess
-- FROM software s
-- join result_organization ro on s.id=ro.id
-- join organization o on o.id=ro.organization
-- where cast(year as int)>=2003 and cast(year as int)<=2021
-- group by year, country) tmp;
--create table indi_other_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
-- from
-- (SELECT year, country, SUM(CASE
-- WHEN bestlicence='Open Access' THEN 1
-- ELSE 0
-- END) AS OpenAccess, SUM(CASE
-- WHEN bestlicence<>'Open Access' THEN 1
-- ELSE 0
-- END) AS NonOpenAccess
-- FROM otherresearchproduct orp
-- join result_organization ro on orp.id=ro.id
-- join organization o on o.id=ro.organization
-- where cast(year as int)>=2003 and cast(year as int)<=2021
-- group by year, country) tmp;
--create table indi_pub_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join publication p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofpubs
--from total;
--create table indi_dataset_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join dataset p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofdataset
--from total;
--create table indi_software_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join software p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofsoftware
--from total;
--create table indi_other_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join otherresearchproduct p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofother
--from total;
--create table indi_other_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from otherresearchproduct_datasources pd
--join datasource d on datasource=d.id
--join otherresearchproduct p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
--from total;
--create table indi_software_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from software_datasources pd
--join datasource d on datasource=d.id
--join software p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
--from total;
--create table indi_dataset_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from dataset_datasources pd
--join datasource d on datasource=d.id
--join dataset p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
--from total;
--create table indi_pub_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from publication_datasources pd
--join datasource d on datasource=d.id
--join publication p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfPubs
--from total;
---- Sprint 2 ---- ---- Sprint 2 ----
create table indi_result_has_cc_licence_f stored as parquet as create table indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from result r from result r
left outer join (select r.id, license.type as lic from result r left outer join (select r.id, license.type as lic from result r
@ -233,35 +60,6 @@ join result_licenses as license on license.id = r.id
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on r.id= tmp.id; on r.id= tmp.id;
-- EOSC-TR1.1-02M:
-- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might be only useful when applied
-- to openly available publications.
--create table indi_pub_has_cc_licence_tr stored as parquet as
--select distinct p.id, case when lic='' or lic is null then 0 else 1 end indi_result_org_collabas has_cc_license_tr
--from publication p
--left outer join (select p.id, license.type as lic from publication p
--join publication_licenses as license on license.id = p.id
--where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
--on p.id= tmp.id
-- #EOSC-F2-01M_cc Rich metadata for scholarly publications
-- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might be only useful when applied
-- to openly available publications.
-- Same indicator as EOSC-TR1.1-02M (Najko's instructions)
-- create table indi_pub_has_cc_licence_f stored as parquet as
-- select
-- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f
-- from publication p
-- left outer join (selectp.id,license.type as lic from publication p
-- join publication_licenses as license on license.id = p.id
-- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
-- on p.id= tmp.id
create table indi_pub_has_abstract stored as parquet as create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication; from publication;
@ -270,16 +68,16 @@ create table indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp left outer join (select id, 1 as has_orcid from result_orcid) tmp
on r.id= tmp.id on r.id= tmp.id;
---- Sprint 3 ---- ---- Sprint 3 ----
create table indi_funded_result_with_fundref stored as parquet as create table indi_funded_result_with_fundref stored as parquet as
select distinct r.id, coalesce(fundref, 0) as fundref select distinct r.id, coalesce(fundref, 0) as fundref
from project_results r from project_results r
left outer join (select distinct id, 1 as fundref from project_results left outer join (select distinct id, 1 as fundref from project_results
where provenance='Harvested') tmp where provenance='Harvested') tmp
on r.id= tmp.id on r.id= tmp.id;
create table indi_result_org_country_collab stored as parquet as create table indi_result_org_country_collab stored as parquet as
with tmp as with tmp as
@ -290,7 +88,7 @@ select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as col
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country where o1.id<>o2.id and o1.country<>o2.country
group by o1.id, o1.type,o2.country group by o1.id, o1.type,o2.country;
create table indi_result_org_collab stored as parquet as create table indi_result_org_collab stored as parquet as
with tmp as with tmp as
@ -301,18 +99,18 @@ select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaboratio
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id where o1.id<>o2.id
group by o1.id, o2.id, o1.type group by o1.id, o2.id, o1.type;
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id join organization o on o.id=op.id
join project p on p.id=op.project join project p on p.id=op.project
where country <> 'UNKNOWN') where country <> 'UNKNOWN')
select f1.funder, f1.country, f2.country, count(distinct f1.project) as collaborations select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1 from tmp as f1
join tmp as f2 on f1.project=f2.project join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country where f1.country<>f2.country
group by f1.funder, f2.country, f1.country group by f1.funder, f2.country, f1.country;
create table indi_result_country_collab stored as parquet as create table indi_result_country_collab stored as parquet as
with tmp as with tmp as
@ -323,10 +121,9 @@ select o1.country country1, o2.country country2, o1.type, count(distinct o1.resu
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country where o1.country<>o2.country
group by o1.country, o2.country, o1.type group by o1.country, o2.country, o1.type;
---- Sprint 4 ---- ---- Sprint 4 ----
create table indi_pub_diamond stored as parquet as create table indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from publication_datasources pd from publication_datasources pd
@ -335,7 +132,7 @@ select pd.id, 1 as in_diamond_journal from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
on pd.id=tmp.id on pd.id=tmp.id;
create table indi_pub_hybrid stored as parquet as create table indi_pub_hybrid stored as parquet as
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
@ -345,7 +142,7 @@ select pd.id, 1 as is_hybrid from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
on pd.id=tmp.id on pd.id=tmp.id;
create table indi_is_gold_oa stored as parquet as create table indi_is_gold_oa stored as parquet as
(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa (select distinct pd.id, coalesce(gold_oa, 0) as gold_oa
@ -355,8 +152,7 @@ select pd.id, 1 as gold_oa from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online)
where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp
on pd.id=tmp.id) on pd.id=tmp.id);
create table indi_pub_in_transformative stored as parquet as create table indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
@ -366,7 +162,7 @@ select pd.id, 1 as is_transformative from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and ps.is_transformative_journal=true) tmp and ps.is_transformative_journal=true) tmp
on pd.id=tmp.id on pd.id=tmp.id;
create table indi_pub_closed_other_open stored as parquet as create table indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
@ -376,9 +172,9 @@ join publication p on p.id=ri.id
join datasource d on ri.hostedby=d.id join datasource d on ri.hostedby=d.id
where d.type like '%Journal%' and ri.accessright='Closed Access' and where d.type like '%Journal%' and ri.accessright='Closed Access' and
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id on tmp.id=ri.id;
---- Sprint 5 ---- ---- Sprint 5 ----
create table indi_result_no_of_copies stored as parquet as create table indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from result_instance group by id select id, count(id) as number_of_copies from result_instance group by id;

View File

@ -49,8 +49,10 @@ compute stats TARGET.result_greenoa;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_languages; compute stats TARGET.result_languages;
create table TARGET.result_licences stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_licences; compute stats TARGET.result_licenses;
create table TARGET.licenses_normalized as select * from SOURCE.licenses_normalized;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_oids; compute stats TARGET.result_oids;
@ -79,6 +81,9 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics; compute stats TARGET.result_topics;
create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
compute stats TARGET.result_result;
-- datasources -- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource; create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
@ -100,6 +105,7 @@ create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
compute stats TARGET.project_results; compute stats TARGET.project_results;
@ -115,8 +121,8 @@ create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.i
compute stats TARGET.indi_pub_gold_oa; compute stats TARGET.indi_pub_gold_oa;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_has_abstract; compute stats TARGET.indi_pub_has_abstract;
create table TARGET.indi_result_has_cc_licence_f stored as parquet as select * from SOURCE.indi_result_has_cc_licence_f orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_has_cc_licence_f; compute stats TARGET.indi_result_has_cc_licence;
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_has_cc_licence_url; compute stats TARGET.indi_result_has_cc_licence_url;
@ -138,6 +144,9 @@ compute stats TARGET.indi_pub_closed_other_open;
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_no_of_copies; compute stats TARGET.indi_result_no_of_copies;
--- Usage statistics
create table TARGET.usage_stats stored as parquet as select * from SOURCE.usage_stats orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--denorm --denorm
alter table TARGET.result rename to TARGET.res_tmp; alter table TARGET.result rename to TARGET.res_tmp;