[stats wf] Added sprint 3&4 of indicators #166

Merged
claudio.atzori merged 5 commits from antonis.lempesis/dnet-hadoop:beta into beta 2021-11-29 10:40:26 +01:00
1 changed files with 237 additions and 168 deletions
Showing only changes of commit 29f69f2f89 - Show all commits

View File

@ -8,7 +8,7 @@ join result_instance ri on ri.id = p.id
join datasource on datasource.id = ri.hostedby join datasource on datasource.id = ri.hostedby
where datasource.type like '%Repository%' where datasource.type like '%Repository%'
and (ri.accessright = 'Open Access' and (ri.accessright = 'Open Access'
or ri.accessright = 'Embargo')) tmp or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
on p.id= tmp.id; on p.id= tmp.id;
create table indi_pub_grey_lit stored as parquet as create table indi_pub_grey_lit stored as parquet as
@ -41,178 +41,178 @@ join datasource on datasource.id = ri.hostedby
where datasource.id like '%doajarticles%') tmp where datasource.id like '%doajarticles%') tmp
on p.id= tmp.id; on p.id= tmp.id;
create table indi_project_pubs_count stored as parquet as --create table indi_project_pubs_count stored as parquet as
select pr.id id, count(p.id) total_pubs from project_results pr --select pr.id id, count(p.id) total_pubs from project_results pr
join publication p on p.id=pr.result --join publication p on p.id=pr.result
group by pr.id; --group by pr.id;
create table indi_project_datasets_count stored as parquet as --create table indi_project_datasets_count stored as parquet as
select pr.id id, count(d.id) total_datasets from project_results pr --select pr.id id, count(d.id) total_datasets from project_results pr
join dataset d on d.id=pr.result --join dataset d on d.id=pr.result
group by pr.id; --group by pr.id;
create table indi_project_software_count stored as parquet as --create table indi_project_software_count stored as parquet as
select pr.id id, count(s.id) total_software from project_results pr --select pr.id id, count(s.id) total_software from project_results pr
join software s on s.id=pr.result --join software s on s.id=pr.result
group by pr.id; --group by pr.id;
create table indi_project_otherresearch_count stored as parquet as --create table indi_project_otherresearch_count stored as parquet as
select pr.id id, count(o.id) total_other from project_results pr --select pr.id id, count(o.id) total_other from project_results pr
join otherresearchproduct o on o.id=pr.result --join otherresearchproduct o on o.id=pr.result
group by pr.id; --group by pr.id;
create table indi_pub_avg_year_country_oa stored as parquet as --create table indi_pub_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, --select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA --round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from --from
(SELECT year, country, SUM(CASE --(SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1 --WHEN bestlicence='Open Access' THEN 1
ELSE 0 --ELSE 0
END) AS OpenAccess, SUM(CASE --END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1 --WHEN bestlicence<>'Open Access' THEN 1
ELSE 0 --ELSE 0
END) AS NonOpenAccess --END) AS NonOpenAccess
FROM publication p --FROM publication p
join result_organization ro on p.id=ro.id --join result_organization ro on p.id=ro.id
join organization o on o.id=ro.organization --join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp; --group by year, country) tmp;
create table indi_dataset_avg_year_country_oa stored as parquet as --create table indi_dataset_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, --select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA --round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from --from
(SELECT year, country, SUM(CASE --(SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1 --WHEN bestlicence='Open Access' THEN 1
ELSE 0 --ELSE 0
END) AS OpenAccess, SUM(CASE --END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1 --WHEN bestlicence<>'Open Access' THEN 1
ELSE 0 --ELSE 0
END) AS NonOpenAccess --END) AS NonOpenAccess
FROM dataset d --FROM dataset d
join result_organization ro on d.id=ro.id --join result_organization ro on d.id=ro.id
join organization o on o.id=ro.organization --join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp; --group by year, country) tmp;
create table indi_software_avg_year_country_oa stored as parquet as --create table indi_software_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, --select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA --round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from --from
(SELECT year, country, SUM(CASE -- (SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1 --WHEN bestlicence='Open Access' THEN 1
ELSE 0 -- ELSE 0
END) AS OpenAccess, SUM(CASE --END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1 -- WHEN bestlicence<>'Open Access' THEN 1
ELSE 0 -- ELSE 0
END) AS NonOpenAccess -- END) AS NonOpenAccess
FROM software s -- FROM software s
join result_organization ro on s.id=ro.id -- join result_organization ro on s.id=ro.id
join organization o on o.id=ro.organization -- join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021 -- where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp; -- group by year, country) tmp;
create table indi_other_avg_year_country_oa stored as parquet as --create table indi_other_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, --select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA --round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from -- from
(SELECT year, country, SUM(CASE -- (SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1 -- WHEN bestlicence='Open Access' THEN 1
ELSE 0 -- ELSE 0
END) AS OpenAccess, SUM(CASE -- END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1 -- WHEN bestlicence<>'Open Access' THEN 1
ELSE 0 -- ELSE 0
END) AS NonOpenAccess -- END) AS NonOpenAccess
FROM otherresearchproduct orp -- FROM otherresearchproduct orp
join result_organization ro on orp.id=ro.id -- join result_organization ro on orp.id=ro.id
join organization o on o.id=ro.organization -- join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021 -- where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp; -- group by year, country) tmp;
create table indi_pub_avg_year_context_oa stored as parquet as --create table indi_pub_avg_year_context_oa stored as parquet as
with total as --with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc --(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
join context c on pc.concept like concat('%',c.id,'%') --join context c on pc.concept like concat('%',c.id,'%')
join publication p on p.id=pc.id --join publication p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year ) --group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofpubs --select year, name, round(no_of_pubs/total*100,3) averageofpubs
from total; --from total;
create table indi_dataset_avg_year_context_oa stored as parquet as --create table indi_dataset_avg_year_context_oa stored as parquet as
with total as --with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc --(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
join context c on pc.concept like concat('%',c.id,'%') --join context c on pc.concept like concat('%',c.id,'%')
join dataset p on p.id=pc.id --join dataset p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year ) --group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofdataset --select year, name, round(no_of_pubs/total*100,3) averageofdataset
from total; --from total;
create table indi_software_avg_year_context_oa stored as parquet as --create table indi_software_avg_year_context_oa stored as parquet as
with total as --with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc --(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
join context c on pc.concept like concat('%',c.id,'%') --join context c on pc.concept like concat('%',c.id,'%')
join software p on p.id=pc.id --join software p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year ) --group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofsoftware --select year, name, round(no_of_pubs/total*100,3) averageofsoftware
from total; --from total;
create table indi_other_avg_year_context_oa stored as parquet as --create table indi_other_avg_year_context_oa stored as parquet as
with total as --with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc --(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
join context c on pc.concept like concat('%',c.id,'%') --join context c on pc.concept like concat('%',c.id,'%')
join otherresearchproduct p on p.id=pc.id --join otherresearchproduct p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year ) --group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofother --select year, name, round(no_of_pubs/total*100,3) averageofother
from total; --from total;
create table indi_other_avg_year_content_oa stored as parquet as --create table indi_other_avg_year_content_oa stored as parquet as
with total as --with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total --(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from otherresearchproduct_datasources pd --from otherresearchproduct_datasources pd
join datasource d on datasource=d.id --join datasource d on datasource=d.id
join otherresearchproduct p on p.id=pd.id --join otherresearchproduct p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year) --group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct --select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
from total; --from total;
create table indi_software_avg_year_content_oa stored as parquet as --create table indi_software_avg_year_content_oa stored as parquet as
with total as --with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total --(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from software_datasources pd --from software_datasources pd
join datasource d on datasource=d.id --join datasource d on datasource=d.id
join software p on p.id=pd.id --join software p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year) --group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfSoftware --select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
from total; --from total;
create table indi_dataset_avg_year_content_oa stored as parquet as --create table indi_dataset_avg_year_content_oa stored as parquet as
with total as --with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total --(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from dataset_datasources pd --from dataset_datasources pd
join datasource d on datasource=d.id --join datasource d on datasource=d.id
join dataset p on p.id=pd.id --join dataset p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year) --group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfDatasets --select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
from total; --from total;
create table indi_pub_avg_year_content_oa stored as parquet as --create table indi_pub_avg_year_content_oa stored as parquet as
with total as --with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total --(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from publication_datasources pd --from publication_datasources pd
join datasource d on datasource=d.id --join datasource d on datasource=d.id
join publication p on p.id=pd.id --join publication p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021 --where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year) --group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfPubs --select year, type, round(no_of_pubs/total*100,3) averageOfPubs
from total; --from total;
create table indi_pub_has_cc_licence stored as parquet as create table indi_pub_has_cc_licence stored as parquet as
select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
@ -231,11 +231,40 @@ join publication_licenses as license on license.id = p.id
WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp
on p.id= tmp.id; on p.id= tmp.id;
-- EOSC-TR1.1-02M:
-- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might only be useful when applied
-- to openly available publications.
--
-- Produces one row per distinct publication id. has_cc_license_tr is 1 when the
-- publication has at least one licence whose lower-cased type contains
-- 'creativecommons.org' or 'cc-', and 0 otherwise (no matching licence row).
create table indi_pub_has_cc_licence_tr stored as parquet as
select distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_tr
from publication p
-- tmp holds only the publications that carry a CC-looking licence
left outer join (select p.id, license.type as lic from publication p
join publication_licenses as license on license.id = p.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on p.id= tmp.id;
-- #EOSC-F2-01M_cc Rich metadata for scholarly publications
-- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might only be useful when applied
-- to openly available publications.
-- Same indicator as EOSC-TR1.1-02M (Najko's instructions)
-- create table indi_pub_has_cc_licence_f stored as parquet as
-- select
-- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f
-- from publication p
-- left outer join (select p.id, license.type as lic from publication p
-- join publication_licenses as license on license.id = p.id
-- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
-- on p.id= tmp.id
create table indi_pub_has_abstract stored as parquet as create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication; from publication;
create table indi_with_orcid stored as parquet as create table indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp left outer join (select id, 1 as has_orcid from result_orcid) tmp
@ -270,13 +299,53 @@ join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id where o1.id<>o2.id
group by o1.id, o2.id, o1.type group by o1.id, o2.id, o1.type
create table indi_result_org_country_collab stored as parquet as create table indi_pub_diamond stored as parquet as
with tmp as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
(select o.id as id, o.country , ro.id as result,r.type from organization o from publication_datasources pd
join result_organization ro on o.id=ro.organization left outer join (
join result r on r.id=ro.id where o.country <> 'UNKNOWN') select pd.id, 1 as in_diamond_journal from publication_datasources pd
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations join datasource d on d.id=pd.datasource
from tmp as o1 join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
join tmp as o2 on o1.result=o2.result and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
where o1.id<>o2.id and o1.country<>o2.country on pd.id=tmp.id
group by o1.id, o1.type,o2.country
-- Indicator: is_hybrid.
-- Produces one row per distinct id in publication_datasources. is_hybrid is 1 when
-- at least one datasource of the publication matches a stats_ext.plan_s_jn entry on
-- BOTH print and online ISSN and that entry has journal_is_in_doaj=false and
-- journal_is_oa=false; otherwise 0.
create table indi_pub_hybrid stored as parquet as
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
from publication_datasources pd
left outer join (
select pd.id, 1 as is_hybrid from publication_datasources pd
join datasource d on d.id=pd.datasource
-- the ISSN match is the join condition; keeping it in ON avoids an ON-less (cartesian) join
join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
where ps.journal_is_in_doaj=false and ps.journal_is_oa=false) tmp
on pd.id=tmp.id;
-- Indicator: gold_oa.
-- Produces one row per distinct id in publication_datasources. gold_oa is 1 when at
-- least one datasource of the publication matches a stats_ext.plan_s_jn entry that is
-- in DOAJ or marked as OA; otherwise 0.
-- NOTE(review): the ISSN match here uses OR (print OR online), whereas the other
-- plan_s_jn-based indicators in this script require both ISSNs to match — confirm
-- this difference is intended.
create table indi_is_gold_oa stored as parquet as
(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa
from publication_datasources pd
left outer join (
select pd.id, 1 as gold_oa from publication_datasources pd
join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online)
where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp
on pd.id=tmp.id);
-- Indicator: is_transformative.
-- Produces one row per distinct publication id. is_transformative is 1 when at least
-- one datasource of the publication matches a stats_ext.plan_s_jn entry on BOTH print
-- and online ISSN and that entry has is_transformative_journal=true; otherwise 0.
create table indi_pub_in_transformative stored as parquet as
select distinct p.id, coalesce(is_transformative, 0) as is_transformative
from publication p
left outer join (
select pd.id, 1 as is_transformative from publication_datasources pd
join datasource d on d.id=pd.datasource
-- the ISSN match is the join condition; keeping it in ON avoids an ON-less (cartesian) join
join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
where ps.is_transformative_journal=true) tmp
on p.id=tmp.id;
-- Indicator: pub_closed_other_open.
-- Produces one row per distinct id in result_instance. The flag is 1 when the id is a
-- publication with at least one instance that is 'Closed Access' and hosted by a
-- datasource whose type contains 'Journal', while the publication's bestlicence is
-- 'Open Access' or 'Open Source'; otherwise 0.
create table indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join
(select ri.id, 1 as pub_closed_other_open from result_instance ri
join publication p on p.id=ri.id
join datasource d on ri.hostedby=d.id
where d.type like '%Journal%' and ri.accessright='Closed Access' and
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id;