Sprint 5 and other changes

This commit is contained in:
dimitrispie 2021-12-20 19:23:57 +02:00
parent 09fc2afdca
commit c1cdec09a9
1 changed file with 83 additions and 61 deletions

View File

@ -1,28 +1,29 @@
---- Sprint 1 ----
-- indi_pub_green_oa: one row per distinct publication id with a 0/1 green_oa flag.
-- The flag is 1 when the publication has an instance hosted by a datasource whose
-- type contains 'Repository' and whose access right is 'Open Access', 'Embargo'
-- or 'Open Source'; coalesce maps publications without such an instance to 0.
create table indi_pub_green_oa stored as parquet as create table indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select distinct p.id, coalesce(green_oa, 0) as green_oa
from publication p from publication p
left outer join ( left outer join (
select p.id, 1 as green_oa select p.id, 1 as green_oa
from publication p from publication p
join result_instance ri on ri.id = p.id join result_instance ri on ri.id = p.id
join datasource on datasource.id = ri.hostedby join datasource on datasource.id = ri.hostedby
where datasource.type like '%Repository%' where datasource.type like '%Repository%'
and (ri.accessright = 'Open Access' and (ri.accessright = 'Open Access'
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
on p.id= tmp.id; on p.id= tmp.id;
-- indi_pub_grey_lit: one row per distinct publication id with a 0/1 grey_lit flag.
-- The flag is 1 when the publication has a classification whose type is outside the
-- listed types (articles, books, theses, data papers, conference objects) and the
-- publication has no 'Other literature type' classification; otherwise 0.
create table indi_pub_grey_lit stored as parquet as create table indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit select distinct p.id, coalesce(grey_lit, 0) as grey_lit
from publication p from publication p
left outer join ( left outer join (
select p.id, 1 as grey_lit select p.id, 1 as grey_lit
from publication p from publication p
join result_classifications rt on rt.id = p.id join result_classifications rt on rt.id = p.id
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id;
create table indi_pub_doi_from_crossref stored as parquet as create table indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from publication p from publication p
left outer join left outer join
(select ri.id, 1 as doi_from_crossref from result_instance ri (select ri.id, 1 as doi_from_crossref from result_instance ri
@ -33,7 +34,7 @@ on tmp.id=p.id;
create table indi_pub_gold_oa stored as parquet as create table indi_pub_gold_oa stored as parquet as
select distinct p.id, coalesce(gold_oa, 0) as gold_oa select distinct p.id, coalesce(gold_oa, 0) as gold_oa
from publication p from publication p
left outer join ( left outer join (
select p.id, 1 as gold_oa select p.id, 1 as gold_oa
from publication p from publication p
join result_instance ri on ri.id = p.id join result_instance ri on ri.id = p.id
@ -214,82 +215,85 @@ on p.id= tmp.id;
--select year, type, round(no_of_pubs/total*100,3) averageOfPubs --select year, type, round(no_of_pubs/total*100,3) averageOfPubs
--from total; --from total;
-- Side-by-side rows: the old indi_pub_has_cc_licence statement (publication /
-- publication_licenses) and the new indi_result_has_cc_licence_f statement (result /
-- result_licenses), offset by one row because of the inserted Sprint 2 marker.
-- Both emit has_cc_license = 1 for ids that have a license whose lower-cased type
-- contains 'creativecommons.org' or 'cc-', and 0 for every other id.
create table indi_pub_has_cc_licence stored as parquet as ---- Sprint 2 ----
select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license create table indi_result_has_cc_licence_f stored as parquet as
from publication p select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
left outer join (select p.id, license.type as lic from publication p from result r
join publication_licenses as license on license.id = p.id left outer join (select r.id, license.type as lic from result r
join result_licenses as license on license.id = r.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on p.id= tmp.id; on r.id= tmp.id;
-- Side-by-side rows: old indi_pub_has_cc_licence_url (publication based) and new
-- indi_result_has_cc_licence_url (result based).
-- has_cc_license_url is 1 when the id has a license whose type, parsed as a URL, has
-- the host 'creativecommons.org' (compared after lower()); otherwise 0.
-- The new side writes the string literals with double quotes.
create table indi_pub_has_cc_licence_url stored as parquet as create table indi_result_has_cc_licence_url stored as parquet as
select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from publication p from result r
left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host
from publication p from result r
join publication_licenses as license on license.id = p.id join result_licenses as license on license.id = r.id
WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on p.id= tmp.id; on r.id= tmp.id;
-- EOSC-TR1.1-02M: -- EOSC-TR1.1-02M:
-- ## Indicator: has_cc_license. Creative Commons licensing has become a -- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives -- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might be only useful when applied -- like Plan S. This indicator might be only useful when applied
-- to openly available publications. -- to openly available publications.
--create table indi_pub_has_cc_licence_tr stored as parquet as --create table indi_pub_has_cc_licence_tr stored as parquet as
--select distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_tr --select distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_tr
--from publication p --from publication p
--left outer join (select p.id, license.type as lic from publication p --left outer join (select p.id, license.type as lic from publication p
--join publication_licenses as license on license.id = p.id --join publication_licenses as license on license.id = p.id
--where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp --where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
--on p.id= tmp.id --on p.id= tmp.id
-- #EOSC-F2-01M_cc Rich metadata for scholarly publications -- #EOSC-F2-01M_cc Rich metadata for scholarly publications
-- ## Indicator: has_cc_license. Creative Commons licensing has become a -- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives -- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might be only useful when applied -- like Plan S. This indicator might be only useful when applied
-- to openly available publications. -- to openly available publications.
-- Same indicator as EOSC-TR1.1-02M (Najko's instructions) -- Same indicator as EOSC-TR1.1-02M (Najko's instructions)
-- create table indi_pub_has_cc_licence_f stored as parquet as -- create table indi_pub_has_cc_licence_f stored as parquet as
-- select -- select
-- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f -- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f
-- from publication p -- from publication p
-- left outer join (select p.id, license.type as lic from publication p -- left outer join (select p.id, license.type as lic from publication p
-- join publication_licenses as license on license.id = p.id -- join publication_licenses as license on license.id = p.id
-- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp -- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
-- on p.id= tmp.id -- on p.id= tmp.id
-- indi_pub_has_abstract: distinct (publication id, has_abstract) pairs.
-- NOTE(review): coalesce(abstract, 1) returns the abstract column's own value when it
-- is not NULL and 1 only when it is NULL, so has_abstract is a 0/1 indicator only if
-- publication.abstract already holds such a flag — confirm the column's type, and
-- whether a NULL abstract is really meant to map to 1.
create table indi_pub_has_abstract stored as parquet as create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication; from publication;
-- indi_result_with_orcid: one row per distinct result id with a 0/1 has_orcid flag;
-- the flag is 1 when the id appears in result_orcid and 0 otherwise.
create table indi_result_with_orcid stored as parquet as create table indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp left outer join (select id, 1 as has_orcid from result_orcid) tmp
on r.id= tmp.id
-- indi_funded_result_with_fundref (single-column rows): 0/1 fundref flag per
-- project_results id, 1 when at least one row for that id has provenance='Harvested'.
create table indi_funded_result_with_fundref stored as parquet as
select distinct r.id, coalesce(fundref, 0) as fundref
from project_results r
left outer join (select distinct id, 1 as fundref from project_results
where provenance='Harvested') tmp
on r.id= tmp.id on r.id= tmp.id
create table indi_result_org_country_collab stored as parquet as ---- Sprint 3 ----
with tmp as
-- indi_funded_result_with_fundref: one row per distinct project_results id with a
-- 0/1 fundref flag. The flag is 1 when at least one project_results row with that id
-- has provenance 'Harvested'; ids without such a row get 0 through coalesce.
create table indi_funded_result_with_fundref stored as parquet as
with harvested as (
    -- ids that have at least one harvested link
    select distinct id, 1 as fundref
    from project_results
    where provenance = 'Harvested'
)
select distinct
    pr.id,
    coalesce(harvested.fundref, 0) as fundref
from project_results pr
left outer join harvested
    on pr.id = harvested.id
-- indi_result_org_country_collab: for each organization (org1), other country
-- (country2) and result type, the number of distinct results that org1 shares with a
-- different organization located in a different country.
-- tmp lists (organization, country, result, result type) and drops organizations
-- whose country is 'UNKNOWN'; the self-join on result pairs up co-affiliated
-- organizations.
create table indi_result_org_country_collab stored as parquet as
with tmp as
(select o.id as id, o.country , ro.id as result,r.type from organization o (select o.id as id, o.country , ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id where o.country <> 'UNKNOWN') join result r on r.id=ro.id where o.country <> 'UNKNOWN')
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country where o1.id<>o2.id and o1.country<>o2.country
group by o1.id, o1.type,o2.country group by o1.id, o1.type,o2.country
create table indi_result_org_collab stored as parquet as create table indi_result_org_collab stored as parquet as
with tmp as with tmp as
(select o.id, ro.id as result,r.type from organization o (select o.id, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id) join result r on r.id=ro.id)
@ -299,19 +303,32 @@ join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id where o1.id<>o2.id
group by o1.id, o2.id, o1.type group by o1.id, o2.id, o1.type
-- indi_funder_country_collab: for each funder and ordered pair of different
-- countries, the number of distinct projects in which organizations from both
-- countries take part. Organizations with country 'UNKNOWN' are excluded in tmp.
-- NOTE(review): the select list emits f1.country and f2.country without aliases, so
-- the created table would have two columns named country — verify the engine accepts
-- this; aliasing them (e.g. country1 / country2, as indi_result_country_collab does)
-- would avoid the clash.
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id join organization o on o.id=op.id
join project p on p.id=op.project join project p on p.id=op.project
where country <> 'UNKNOWN') where country <> 'UNKNOWN')
select f1.funder, f1.country, f2.country, count(distinct f1.project) as collaborations select f1.funder, f1.country, f2.country, count(distinct f1.project) as collaborations
from tmp as f1 from tmp as f1
join tmp as f2 on f1.project=f2.project join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country where f1.country<>f2.country
group by f1.funder, f2.country, f1.country group by f1.funder, f2.country, f1.country
-- indi_result_country_collab: for every ordered pair of different countries
-- (country1, country2) and every result type, the number of distinct results that
-- have at least one organization in each of the two countries.
-- tmp lists (country, result, result type) per organization-result link.
-- Organizations whose country is 'UNKNOWN' are filtered out, as in
-- indi_result_org_country_collab and indi_funder_country_collab; without the filter
-- 'UNKNOWN' would be counted as a collaborating country.
create table indi_result_country_collab stored as parquet as
with tmp as
(select o.country, ro.id as result, r.type from organization o
join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id
where o.country <> 'UNKNOWN')
select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country
group by o1.country, o2.country, o1.type;
---- Sprint 4 ----
create table indi_pub_diamond stored as parquet as create table indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as in_diamond_journal from publication_datasources pd select pd.id, 1 as in_diamond_journal from publication_datasources pd
@ -321,7 +338,7 @@ and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false)
on pd.id=tmp.id on pd.id=tmp.id
create table indi_pub_hybrid stored as parquet as create table indi_pub_hybrid stored as parquet as
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as is_hybrid from publication_datasources pd select pd.id, 1 as is_hybrid from publication_datasources pd
@ -331,7 +348,7 @@ and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
on pd.id=tmp.id on pd.id=tmp.id
create table indi_is_gold_oa stored as parquet as create table indi_is_gold_oa stored as parquet as
(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa (select distinct pd.id, coalesce(gold_oa, 0) as gold_oa
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as gold_oa from publication_datasources pd select pd.id, 1 as gold_oa from publication_datasources pd
@ -342,7 +359,7 @@ on pd.id=tmp.id)
create table indi_pub_in_transformative stored as parquet as create table indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from publication pd from publication pd
left outer join ( left outer join (
select pd.id, 1 as is_transformative from publication_datasources pd select pd.id, 1 as is_transformative from publication_datasources pd
@ -353,10 +370,15 @@ on pd.id=tmp.id
-- indi_pub_closed_other_open: one row per distinct result_instance id with a 0/1
-- pub_closed_other_open flag. The flag is 1 when the id is a publication that has a
-- 'Closed Access' instance hosted by a datasource whose type contains 'Journal',
-- while the publication's bestlicence is 'Open Access' or 'Open Source'.
create table indi_pub_closed_other_open stored as parquet as create table indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join left outer join
(select ri.id, 1 as pub_closed_other_open from result_instance ri (select ri.id, 1 as pub_closed_other_open from result_instance ri
join publication p on p.id=ri.id join publication p on p.id=ri.id
join datasource d on ri.hostedby=d.id join datasource d on ri.hostedby=d.id
where d.type like '%Journal%' and ri.accessright='Closed Access' and where d.type like '%Journal%' and ri.accessright='Closed Access' and
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id on tmp.id=ri.id
---- Sprint 5 ----
-- indi_result_no_of_copies: for each id in result_instance, how many instance rows
-- carry that id (count(id) counts the non-NULL occurrences within each group).
create table indi_result_no_of_copies stored as parquet as
select
    ri.id,
    count(ri.id) as number_of_copies
from result_instance ri
group by ri.id