2022-08-02 12:39:34 +02:00
-- Sprint 1 ----
2023-01-04 10:39:01 +01:00
-- indi_pub_green_oa: one row per publication.
-- green_oa = 1 when the publication has at least one instance that is hosted
-- by a datasource whose type contains 'Repository' and whose access right is
-- Open Access, Embargo or Open Source; 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_green_oa STORED AS PARQUET AS
SELECT DISTINCT
    p.id,
    COALESCE(tmp.green_oa, 0) AS green_oa
FROM publication p
LEFT OUTER JOIN (
    SELECT
        pub.id,
        1 AS green_oa
    FROM publication pub
    JOIN result_instance ri ON ri.id = pub.id
    JOIN datasource ds ON ds.id = ri.hostedby
    -- Literals must not carry surrounding blanks, otherwise nothing matches.
    WHERE ds.type LIKE '%Repository%'
      AND ri.accessright IN ('Open Access', 'Embargo', 'Open Source')
) tmp ON p.id = tmp.id;

ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_grey_lit: one row per publication.
-- grey_lit = 1 when the publication has a classification outside the listed
-- "formal literature" types AND has no 'Other literature type' classification;
-- 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_grey_lit STORED AS PARQUET AS
SELECT DISTINCT
    p.id,
    COALESCE(tmp.grey_lit, 0) AS grey_lit
FROM publication p
LEFT OUTER JOIN (
    SELECT
        pub.id,
        1 AS grey_lit
    FROM publication pub
    JOIN result_classifications rt ON rt.id = pub.id
    WHERE rt.type NOT IN (
              'Article',
              'Part of book or chapter of book',
              'Book',
              'Doctoral thesis',
              'Master thesis',
              'Data Paper',
              'Thesis',
              'Bachelor thesis',
              'Conference object'
          )
      AND NOT EXISTS (
              SELECT 1
              FROM result_classifications rc
              WHERE rc.type = 'Other literature type'
                AND rc.id = pub.id
          )
) tmp ON p.id = tmp.id;

ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_doi_from_crossref: one row per publication.
-- doi_from_crossref = 1 when the publication has an instance collected from
-- the datasource named 'Crossref' with pidtype 'Digital Object Identifier';
-- 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_doi_from_crossref STORED AS PARQUET AS
SELECT DISTINCT
    p.id,
    COALESCE(tmp.doi_from_crossref, 0) AS doi_from_crossref
FROM publication p
LEFT OUTER JOIN (
    SELECT
        ri.id,
        1 AS doi_from_crossref
    FROM result_instance ri
    JOIN datasource d ON d.id = ri.collectedfrom
    -- pidtype is left unqualified exactly as in the original statement.
    WHERE pidtype = 'Digital Object Identifier'
      AND d.name = 'Crossref'
) tmp ON tmp.id = p.id;

ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
-- Sprint 2 ----
2023-01-04 10:39:01 +01:00
-- indi_result_has_cc_licence: one row per result.
-- has_cc_license = 1 when the result has a licence whose (lower-cased) type
-- contains 'creativecommons.org' or 'cc-'; 0 otherwise.
-- The table name uses "licence" and the column uses "license"; both spellings
-- are kept because other statements in this script reference them.
CREATE TABLE IF NOT EXISTS indi_result_has_cc_licence STORED AS PARQUET AS
SELECT DISTINCT
    r.id,
    (CASE WHEN tmp.lic = '' OR tmp.lic IS NULL THEN 0 ELSE 1 END) AS has_cc_license
FROM result r
LEFT OUTER JOIN (
    SELECT
        res.id,
        license.type AS lic
    FROM result res
    JOIN result_licenses AS license ON license.id = res.id
    WHERE LOWER(license.type) LIKE '%creativecommons.org%'
       OR LOWER(license.type) LIKE '%cc-%'
) tmp ON r.id = tmp.id;

ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_result_has_cc_licence_url: one row per result.
-- has_cc_license_url = 1 when the result has a licence whose type parses as a
-- URL with host 'creativecommons.org' (case-insensitive); 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_result_has_cc_licence_url STORED AS PARQUET AS
SELECT DISTINCT
    r.id,
    CASE WHEN tmp.lic_host = '' OR tmp.lic_host IS NULL THEN 0 ELSE 1 END AS has_cc_license_url
FROM result r
LEFT OUTER JOIN (
    SELECT
        res.id,
        -- The part selector must be exactly 'HOST' (no padding) for parse_url.
        LOWER(parse_url(license.type, 'HOST')) AS lic_host
    FROM result res
    JOIN result_licenses AS license ON license.id = res.id
    WHERE LOWER(parse_url(license.type, 'HOST')) = 'creativecommons.org'
) tmp ON r.id = tmp.id;

ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_has_abstract: one row per publication, has_abstract as 0/1.
-- A NULL abstract flag is treated as "no abstract" (0). The previous default
-- of TRUE reported publications with unknown abstract status as having one.
CREATE TABLE IF NOT EXISTS indi_pub_has_abstract STORED AS PARQUET AS
SELECT DISTINCT
    publication.id,
    CAST(COALESCE(abstract, FALSE) AS INT) AS has_abstract
FROM publication;

ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_result_with_orcid: one row per result.
-- has_orcid = 1 when the result id appears in result_orcid; 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_result_with_orcid STORED AS PARQUET AS
SELECT DISTINCT
    r.id,
    COALESCE(tmp.has_orcid, 0) AS has_orcid
FROM result r
LEFT JOIN (
    SELECT
        id,
        1 AS has_orcid
    FROM result_orcid
) tmp
    ON r.id = tmp.id;

ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS;
2021-10-01 15:02:02 +02:00
2021-12-21 14:54:38 +01:00
---- Sprint 3 ----
2023-01-04 10:39:01 +01:00
-- indi_funded_result_with_fundref: one row per result found in project_results.
-- fundref = 1 when at least one project link of that result has
-- provenance = 'Harvested'; 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_funded_result_with_fundref STORED AS PARQUET AS
SELECT DISTINCT
    r.result AS id,
    COALESCE(tmp.fundref, 0) AS fundref
FROM project_results r
LEFT OUTER JOIN (
    SELECT DISTINCT
        result,
        1 AS fundref
    FROM project_results
    WHERE provenance = 'Harvested'
) tmp ON r.result = tmp.result;

ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2022-08-05 11:54:36 +02:00
-- create table indi_result_org_collab stored as parquet as
-- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations
-- from result_organization as o1
-- join result_organization as o2 on o1.id=o2.id and o1.organization!=o2.organization
-- group by o1.organization, o2.organization;
--
-- compute stats indi_result_org_collab;
--
2023-01-04 10:39:01 +01:00
-- Scratch table: (organization, result id) pairs restricted to organizations
-- that have a non-NULL name.
CREATE TEMPORARY TABLE tmp AS
SELECT
    ro.organization AS organization,
    ro.id
FROM result_organization ro
JOIN organization o ON o.id = ro.organization
WHERE o.name IS NOT NULL;

-- indi_result_org_collab: for every ordered pair of distinct organizations,
-- the number of joined rows that share a result id.
CREATE TABLE IF NOT EXISTS indi_result_org_collab STORED AS PARQUET AS
SELECT
    o1.organization AS org1,
    o2.organization AS org2,
    COUNT(o1.id)    AS collaborations
FROM tmp AS o1
-- Explicit ON clause instead of a bare JOIN filtered in WHERE.
JOIN tmp AS o2
    ON  o1.id = o2.id
    AND o1.organization != o2.organization
GROUP BY o1.organization, o2.organization;

DROP TABLE tmp PURGE;

ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS;
-- Scratch table: distinct (organization, result id, country) for named
-- organizations whose country is known.
CREATE TEMPORARY TABLE tmp AS
SELECT DISTINCT
    ro.organization AS organization,
    ro.id,
    o.country
FROM result_organization ro
JOIN organization o ON o.id = ro.organization
WHERE o.country <> 'UNKNOWN'
  AND o.name IS NOT NULL;

-- indi_result_org_country_collab: for each organization and each *other*
-- country, the number of results shared with organizations of that country.
-- Fixes relative to the previous statement:
--   * o1.id was part of GROUP BY, which produced one row per result rather
--     than one row per (org1, country2);
--   * COUNT(DISTINCT ...) so a result shared with several organizations of the
--     same country is counted once;
--   * the WHERE clause no longer repeats the join predicate.
CREATE TABLE IF NOT EXISTS indi_result_org_country_collab STORED AS PARQUET AS
SELECT
    o1.organization        AS org1,
    o2.country             AS country2,
    COUNT(DISTINCT o1.id)  AS collaborations
FROM tmp AS o1
JOIN tmp AS o2 ON o1.id = o2.id
WHERE o1.country != o2.country
GROUP BY o1.organization, o2.country;

DROP TABLE tmp PURGE;

ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS;
2023-05-25 13:52:34 +02:00
-- Scratch table: (organization id, organization name, project) triples.
-- The table name was missing from the CREATE statement; the query below reads
-- it as "tmp".
CREATE TEMPORARY TABLE tmp AS
SELECT
    o.id       AS organization,
    o.name,
    ro.project AS project
FROM organization o
JOIN organization_projects ro ON o.id = ro.id;

-- indi_project_collab_org: for every ordered pair of organizations that differ
-- in both id and name, the number of distinct projects they share.
CREATE TABLE IF NOT EXISTS indi_project_collab_org STORED AS PARQUET AS
SELECT
    o1.organization            AS org1,
    o1.name                    AS orgname1,
    o2.organization            AS org2,
    o2.name                    AS orgname2,
    COUNT(DISTINCT o1.project) AS collaborations
FROM tmp AS o1
JOIN tmp AS o2 ON o1.project = o2.project
WHERE o1.organization <> o2.organization
  AND o1.name <> o2.name
GROUP BY o1.name, o2.name, o1.organization, o2.organization;

-- Release the scratch table so the next "CREATE TEMPORARY TABLE tmp" in this
-- script does not collide with it.
DROP TABLE tmp PURGE;

ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS;
2022-08-05 12:45:01 +02:00
2023-01-04 10:39:01 +01:00
-- Scratch table: (organization id, country, project) for organizations whose
-- country is known.
CREATE TEMPORARY TABLE tmp AS
SELECT
    o.id       AS organization,
    o.country,
    ro.project AS project
FROM organization o
JOIN organization_projects ro
    ON  o.id = ro.id
    AND o.country <> 'UNKNOWN';

-- indi_project_collab_org_country: for each organization and each other
-- country, the number of distinct projects shared with organizations of that
-- country.
CREATE TABLE IF NOT EXISTS indi_project_collab_org_country STORED AS PARQUET AS
SELECT
    o1.organization            AS org1,
    o2.country                 AS country2,
    COUNT(DISTINCT o1.project) AS collaborations
FROM tmp AS o1
JOIN tmp AS o2 ON o1.project = o2.project
WHERE o1.organization <> o2.organization
  AND o1.country <> o2.country
GROUP BY o1.organization, o2.country;

DROP TABLE tmp PURGE;

ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS;
2022-08-05 12:45:01 +02:00
2023-01-04 10:39:01 +01:00
-- indi_funder_country_collab: per funder and ordered pair of different
-- countries, the number of distinct projects in which organizations from both
-- countries participate.
CREATE TABLE IF NOT EXISTS indi_funder_country_collab STORED AS PARQUET AS
WITH tmp AS (
    -- (funder, project, country) for participating organizations with a known
    -- country; column references are left unqualified as in the original.
    SELECT
        funder,
        project,
        country
    FROM organization_projects op
    JOIN organization o ON o.id = op.id
    JOIN project p ON p.id = op.project
    WHERE country <> 'UNKNOWN'
)
SELECT
    f1.funder,
    f1.country                 AS country1,
    f2.country                 AS country2,
    COUNT(DISTINCT f1.project) AS collaborations
FROM tmp AS f1
JOIN tmp AS f2 ON f1.project = f2.project
WHERE f1.country <> f2.country
GROUP BY f1.funder, f2.country, f1.country;

ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- Scratch table: distinct (country, result) pairs for named organizations
-- whose country is known.
CREATE TEMPORARY TABLE tmp AS
SELECT DISTINCT
    country,
    ro.id AS result
FROM organization o
JOIN result_organization ro ON o.id = ro.organization
WHERE country <> 'UNKNOWN'
  AND o.name IS NOT NULL;

-- indi_result_country_collab: for every ordered pair of different countries,
-- the number of results that have organizations from both.
CREATE TABLE IF NOT EXISTS indi_result_country_collab STORED AS PARQUET AS
SELECT
    o1.country       AS country1,
    o2.country       AS country2,
    COUNT(o1.result) AS collaborations
FROM tmp AS o1
JOIN tmp AS o2 ON o1.result = o2.result
WHERE o1.country <> o2.country
GROUP BY o1.country, o2.country;

DROP TABLE tmp PURGE;

ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS;
2022-09-22 12:33:07 +02:00
2021-12-20 18:23:57 +01:00
---- Sprint 4 ----
2023-01-04 10:39:01 +01:00
-- indi_pub_diamond: one row per id in publication_datasources.
-- in_diamond_journal = 1 when one of the publication's datasources matches a
-- STATS_EXT.plan_s_jn journal on BOTH print and online ISSN, that journal is
-- in DOAJ or is OA, and it has no APC; 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_diamond STORED AS PARQUET AS
SELECT DISTINCT
    pd.id,
    COALESCE(in_diamond_journal, 0) AS in_diamond_journal
FROM publication_datasources pd
LEFT JOIN (
    SELECT
        pd.id,
        1 AS in_diamond_journal
    FROM publication_datasources pd
    JOIN datasource d
        ON d.id = pd.datasource
    JOIN STATS_EXT.plan_s_jn ps
        ON  ps.issn_print  = d.issn_printed
        AND ps.issn_online = d.issn_online
    WHERE (ps.journal_is_in_doaj = true OR ps.journal_is_oa = true)
      AND ps.has_apc = false
) tmp
    ON pd.id = tmp.id;

ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_in_transformative: one row per publication.
-- is_transformative = 1 when one of the publication's datasources matches a
-- STATS_EXT.plan_s_jn journal on BOTH print and online ISSN and that journal
-- is flagged is_transformative_journal; 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_in_transformative STORED AS PARQUET AS
SELECT DISTINCT
    pd.id,
    COALESCE(is_transformative, 0) AS is_transformative
FROM publication pd
LEFT JOIN (
    SELECT
        pd.id,
        1 AS is_transformative
    FROM publication_datasources pd
    JOIN datasource d
        ON d.id = pd.datasource
    JOIN STATS_EXT.plan_s_jn ps
        ON  ps.issn_print  = d.issn_printed
        AND ps.issn_online = d.issn_online
    WHERE ps.is_transformative_journal = true
) tmp
    ON pd.id = tmp.id;

ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_closed_other_open: one row per id in result_instance.
-- pub_closed_other_open = 1 when the id is a publication that has a
-- 'Closed Access' instance hosted by a datasource whose type contains
-- 'Journal', while the publication's bestlicence is 'Open Access' or
-- 'Open Source'; 0 otherwise.
-- NOTE(review): the outer query scans every result_instance id, not only
-- publications -- confirm that this is intended.
CREATE TABLE IF NOT EXISTS indi_pub_closed_other_open STORED AS PARQUET AS
SELECT DISTINCT
    ri.id,
    COALESCE(tmp.pub_closed_other_open, 0) AS pub_closed_other_open
FROM result_instance ri
LEFT OUTER JOIN (
    SELECT
        inst.id,
        1 AS pub_closed_other_open
    FROM result_instance inst
    JOIN publication p ON p.id = inst.id
    JOIN datasource d ON inst.hostedby = d.id
    WHERE d.type LIKE '%Journal%'
      AND inst.accessright = 'Closed Access'
      AND (p.bestlicence = 'Open Access' OR p.bestlicence = 'Open Source')
) tmp ON tmp.id = ri.id;

ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2021-12-21 14:54:38 +01:00
---- Sprint 5 ----
2023-01-04 10:39:01 +01:00
-- indi_result_no_of_copies: number of result_instance rows per result id.
CREATE TABLE IF NOT EXISTS indi_result_no_of_copies STORED AS PARQUET AS
SELECT
    ri.id,
    COUNT(ri.id) AS number_of_copies
FROM result_instance ri
GROUP BY ri.id;

ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2022-08-02 12:39:34 +02:00
---- Sprint 6 ----
2023-01-04 10:39:01 +01:00
-- indi_pub_hybrid_oa_with_cc: one row per id in publication_datasources.
-- is_hybrid_oa = 1 when the publication sits in a not-fully-OA journal of
-- STATS_EXT.plan_s_jn (matched by ISSN), carries a CC licence and is not gold
-- OA; 0 otherwise.
-- NOTE: reads indi_result_has_cc_licence and indi_pub_gold_oa, so both tables
-- must already exist when this statement runs.
CREATE TABLE IF NOT EXISTS indi_pub_hybrid_oa_with_cc STORED AS PARQUET AS
WITH hybrid_oa AS (
    -- Print and online ISSNs of journals that are not fully open access.
    -- The OA filter is applied to BOTH branches; previously it guarded only
    -- the online-ISSN branch, so any journal with a print ISSN slipped through.
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print AS issn
    FROM STATS_EXT.plan_s_jn
    WHERE issn_print != ''
      AND (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)
    UNION ALL
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online AS issn
    FROM STATS_EXT.plan_s_jn
    WHERE issn_online != ''
      AND (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)
),
issn AS (
    -- One row per (datasource id, ISSN); ISSNs of 7 characters or fewer are dropped.
    SELECT ds_issn.id, ds_issn.issn
    FROM (
        SELECT id, issn_printed AS issn
        FROM datasource
        WHERE issn_printed IS NOT NULL
        UNION ALL
        SELECT id, issn_online AS issn
        FROM datasource
        WHERE issn_online IS NOT NULL
    ) AS ds_issn
    WHERE LENGTH(ds_issn.issn) > 7
)
SELECT DISTINCT
    pd.id,
    COALESCE(tmp.is_hybrid_oa, 0) AS is_hybrid_oa
FROM publication_datasources pd
LEFT OUTER JOIN (
    -- The former join to datasource is dropped: none of its columns were used
    -- and the issn CTE is itself derived from datasource.
    SELECT pds.id, 1 AS is_hybrid_oa
    FROM publication_datasources pds
    JOIN issn ON issn.id = pds.datasource
    JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
    JOIN indi_result_has_cc_licence cc ON pds.id = cc.id
    JOIN indi_pub_gold_oa ga ON pds.id = ga.id
    WHERE cc.has_cc_license = 1
      AND ga.is_gold = 0
) tmp ON pd.id = tmp.id;

ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-05-25 13:52:34 +02:00
-- indi_pub_bronze_oa: one row per id in publication_datasources.
-- is_bronze_oa = 1 when the publication sits in a not-fully-OA journal of
-- STATS_EXT.plan_s_jn (matched by ISSN), has NO CC licence, is not gold OA and
-- is not hybrid-OA-with-CC; 0 otherwise.
-- NOTE: reads indi_result_has_cc_licence, indi_pub_gold_oa and
-- indi_pub_hybrid_oa_with_cc, which must already exist.
-- The output column was previously mis-aliased as is_hybrid_oa (copy/paste
-- from the hybrid statement); it is now is_bronze_oa.
CREATE TABLE IF NOT EXISTS indi_pub_bronze_oa STORED AS PARQUET AS
WITH hybrid_oa AS (
    -- Print and online ISSNs of journals that are not fully open access; the
    -- OA filter now guards both branches instead of only the online one.
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print AS issn
    FROM STATS_EXT.plan_s_jn
    WHERE issn_print != ''
      AND (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)
    UNION ALL
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online AS issn
    FROM STATS_EXT.plan_s_jn
    WHERE issn_online != ''
      AND (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)
),
issn AS (
    -- One row per (datasource id, ISSN); ISSNs of 7 characters or fewer are dropped.
    SELECT ds_issn.id, ds_issn.issn
    FROM (
        SELECT id, issn_printed AS issn
        FROM datasource
        WHERE issn_printed IS NOT NULL
        UNION ALL
        SELECT id, issn_online AS issn
        FROM datasource
        WHERE issn_online IS NOT NULL
    ) AS ds_issn
    WHERE LENGTH(ds_issn.issn) > 7
)
SELECT DISTINCT
    pd.id,
    COALESCE(tmp.is_bronze_oa, 0) AS is_bronze_oa
FROM publication_datasources pd
LEFT OUTER JOIN (
    -- The unused join to datasource is dropped (issn is derived from it).
    SELECT pds.id, 1 AS is_bronze_oa
    FROM publication_datasources pds
    JOIN issn ON issn.id = pds.datasource
    JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
    JOIN indi_result_has_cc_licence cc ON pds.id = cc.id
    JOIN indi_pub_gold_oa ga ON pds.id = ga.id
    JOIN indi_pub_hybrid_oa_with_cc hy ON hy.id = pds.id
    WHERE cc.has_cc_license = 0
      AND ga.is_gold = 0
      AND hy.is_hybrid_oa = 0
) tmp ON pd.id = tmp.id;

ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
2023-01-04 10:39:01 +01:00
-- indi_pub_downloads: total downloads per publication, taken from
-- openaire_prod_usage_stats.usage_stats rows with downloads > 0.
CREATE TABLE IF NOT EXISTS indi_pub_downloads STORED AS PARQUET AS
SELECT
    result_id,
    SUM(downloads) AS no_downloads
FROM openaire_prod_usage_stats.usage_stats
JOIN publication
    ON result_id = id
WHERE downloads > 0
GROUP BY
    result_id
ORDER BY
    no_downloads DESC;

ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_downloads_datasource: total downloads per publication and
-- repository, from usage_stats rows with downloads > 0.
CREATE TABLE IF NOT EXISTS indi_pub_downloads_datasource STORED AS PARQUET AS
SELECT
    result_id,
    repository_id,
    SUM(downloads) AS no_downloads
FROM openaire_prod_usage_stats.usage_stats
JOIN publication
    ON result_id = id
WHERE downloads > 0
GROUP BY
    result_id,
    repository_id
ORDER BY
    result_id;

ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_downloads_year: total downloads per publication and year.
-- The year is the first four characters of usage_stats.`date`.
-- Back-quoted identifiers must not contain padding blanks, otherwise they
-- name a different column / alias.
CREATE TABLE IF NOT EXISTS indi_pub_downloads_year STORED AS PARQUET AS
SELECT
    result_id,
    SUBSTRING(us.`date`, 1, 4) AS `year`,
    SUM(downloads)             AS no_downloads
FROM openaire_prod_usage_stats.usage_stats us
JOIN publication ON result_id = id
WHERE downloads > 0
GROUP BY result_id, SUBSTRING(us.`date`, 1, 4);

ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS;
2022-04-06 11:40:02 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_downloads_datasource_year: total downloads per publication,
-- repository and year. The year is the first four characters of
-- usage_stats.`date`.
-- Back-quoted identifiers must not contain padding blanks.
CREATE TABLE IF NOT EXISTS indi_pub_downloads_datasource_year STORED AS PARQUET AS
SELECT
    result_id,
    SUBSTRING(us.`date`, 1, 4) AS `year`,
    repository_id,
    SUM(downloads)             AS no_downloads
FROM openaire_prod_usage_stats.usage_stats us
JOIN publication ON result_id = id
WHERE downloads > 0
GROUP BY result_id, repository_id, SUBSTRING(us.`date`, 1, 4);

ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
---- Sprint 7 ----
2023-01-04 10:39:01 +01:00
-- indi_pub_gold_oa: one row per id in publication_datasources.
-- is_gold = 1 when one of the publication's datasources has an ISSN listed in
-- STATS_EXT.oa_journals (issn_1 or issn_2); 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_gold_oa STORED AS PARQUET AS
WITH gold_oa AS (
    -- Every non-empty ISSN of the OA journal list, one row per ISSN.
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 AS issn
    FROM STATS_EXT.oa_journals
    WHERE issn_1 != ''
    UNION ALL
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 AS issn
    FROM STATS_EXT.oa_journals
    WHERE issn_2 != ''
),
issn AS (
    -- One row per (datasource id, ISSN). The LENGTH filter also removes NULL
    -- ISSNs, so rows admitted only through the '%doajarticles%' id test are
    -- kept only when their issn_online is longer than 7 characters.
    SELECT ds_issn.id, ds_issn.issn
    FROM (
        SELECT id, issn_printed AS issn
        FROM datasource
        WHERE issn_printed IS NOT NULL
        UNION ALL
        SELECT id, issn_online AS issn
        FROM datasource
        WHERE issn_online IS NOT NULL
           OR id LIKE '%doajarticles%'
    ) AS ds_issn
    WHERE LENGTH(ds_issn.issn) > 7
)
SELECT DISTINCT
    pd.id,
    COALESCE(tmp.is_gold, 0) AS is_gold
FROM publication_datasources pd
LEFT OUTER JOIN (
    SELECT pds.id, 1 AS is_gold
    FROM publication_datasources pds
    JOIN issn ON issn.id = pds.datasource
    JOIN gold_oa ON issn.issn = gold_oa.issn
) tmp ON pd.id = tmp.id;

ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_hybrid: one row per id in publication_datasources.
-- is_hybrid = 1 when one of the publication's datasources has an ISSN listed
-- in STATS_EXT.oa_journals for a journal that is either not in DOAJ or not
-- flagged OA; 0 otherwise.
CREATE TABLE IF NOT EXISTS indi_pub_hybrid STORED AS PARQUET AS
WITH gold_oa AS (
    -- Every non-empty ISSN of the journal list with its OA flags. The has_apc
    -- column selected before was never read and is omitted.
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 AS issn
    FROM STATS_EXT.oa_journals
    WHERE issn_1 != ''
    UNION ALL
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 AS issn
    FROM STATS_EXT.oa_journals
    WHERE issn_2 != ''
),
issn AS (
    -- One row per (datasource id, ISSN); NULL ISSNs and ISSNs of 7 characters
    -- or fewer are removed by the LENGTH filter.
    SELECT ds_issn.id, ds_issn.issn
    FROM (
        SELECT id, issn_printed AS issn
        FROM datasource
        WHERE issn_printed IS NOT NULL
        UNION ALL
        SELECT id, issn_online AS issn
        FROM datasource
        WHERE issn_online IS NOT NULL
           OR id LIKE '%doajarticles%'
    ) AS ds_issn
    WHERE LENGTH(ds_issn.issn) > 7
)
SELECT DISTINCT
    pd.id,
    COALESCE(tmp.is_hybrid, 0) AS is_hybrid
FROM publication_datasources pd
LEFT OUTER JOIN (
    -- The former join to datasource is dropped: none of its columns were used
    -- and the issn CTE is itself derived from datasource.
    SELECT pds.id, 1 AS is_hybrid
    FROM publication_datasources pds
    JOIN issn ON issn.id = pds.datasource
    JOIN gold_oa ON issn.issn = gold_oa.issn
    WHERE gold_oa.journal_is_in_doaj = FALSE
       OR gold_oa.journal_is_oa = FALSE
) tmp ON pd.id = tmp.id;

ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- indi_org_fairness: per organization, the share of its post-2003 results
-- that have rich metadata (title, publisher, abstract, year, >= 1 author).
-- Organizations without any such result do not appear (inner join below).
CREATE TABLE IF NOT EXISTS indi_org_fairness STORED AS PARQUET AS
WITH result_fair AS (
    -- Results with rich metadata, counted per organization.
    -- A join to result_pids (PID requirement) is currently disabled here.
    SELECT
        ro.organization       AS organization,
        COUNT(DISTINCT ro.id) AS no_result_fair
    FROM result_organization ro
    JOIN result r ON r.id = ro.id
    WHERE title IS NOT NULL
      AND publisher IS NOT NULL
      AND abstract = TRUE
      AND year IS NOT NULL
      AND authors > 0
      AND CAST(year AS INT) > 2003
    GROUP BY ro.organization
),
allresults AS (
    -- All post-2003 results, counted per organization.
    SELECT
        organization,
        COUNT(DISTINCT ro.id) AS no_allresults
    FROM result_organization ro
    JOIN result r ON r.id = ro.id
    WHERE CAST(year AS INT) > 2003
    GROUP BY organization
)
-- fairness = fair results / all results
SELECT
    allresults.organization,
    result_fair.no_result_fair / allresults.no_allresults AS org_fairness
FROM allresults
JOIN result_fair ON result_fair.organization = allresults.organization;

ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- indi_org_fairness_pub_pr: per organization, the share of its post-2003
-- publications that have rich metadata, a DOI from Crossref
-- (indi_pub_doi_from_crossref) and are not grey literature (indi_pub_grey_lit).
-- Organizations without any such publication do not appear (inner join below).
CREATE TABLE IF NOT EXISTS indi_org_fairness_pub_pr STORED AS PARQUET AS
WITH result_fair AS (
    SELECT
        ro.organization       AS organization,
        COUNT(DISTINCT ro.id) AS no_result_fair
    FROM result_organization ro
    JOIN publication p ON p.id = ro.id
    JOIN indi_pub_doi_from_crossref dc ON dc.id = p.id
    JOIN indi_pub_grey_lit gl ON gl.id = p.id
    WHERE title IS NOT NULL
      AND publisher IS NOT NULL
      AND abstract = TRUE
      AND year IS NOT NULL
      AND authors > 0
      AND CAST(year AS INT) > 2003
      AND dc.doi_from_crossref = 1
      AND gl.grey_lit = 0
    GROUP BY ro.organization
),
allresults AS (
    -- All post-2003 publications, counted per organization.
    SELECT
        organization,
        COUNT(DISTINCT ro.id) AS no_allresults
    FROM result_organization ro
    JOIN publication p ON p.id = ro.id
    WHERE CAST(year AS INT) > 2003
    GROUP BY organization
)
-- fairness = fair publications / all publications
SELECT
    allresults.organization,
    result_fair.no_result_fair / allresults.no_allresults AS org_fairness
FROM allresults
JOIN result_fair ON result_fair.organization = allresults.organization;

ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- Scratch: results with rich metadata per (organization, year), year > 2003.
-- NOTE(review): the target table is named "..._pub_year" but both scratch
-- queries read from result, not publication -- confirm the intended source.
CREATE TEMPORARY TABLE result_fair AS
SELECT
    year,
    ro.organization       AS organization,
    COUNT(DISTINCT ro.id) AS no_result_fair
FROM result_organization ro
JOIN result p
    ON p.id = ro.id
WHERE (title IS NOT NULL)
  AND (publisher IS NOT NULL)
  AND (abstract = true)
  AND (year IS NOT NULL)
  AND (authors > 0)
  AND CAST(year AS INT) > 2003
GROUP BY
    ro.organization,
    year;

-- Scratch: all results per (organization, year), year > 2003.
CREATE TEMPORARY TABLE allresults AS
SELECT
    year,
    organization,
    COUNT(DISTINCT ro.id) AS no_allresults
FROM result_organization ro
JOIN result p
    ON p.id = ro.id
WHERE CAST(year AS INT) > 2003
GROUP BY
    organization,
    year;

-- indi_org_fairness_pub_year: fair / all, per organization and year.
CREATE TABLE IF NOT EXISTS indi_org_fairness_pub_year STORED AS PARQUET AS
SELECT
    allresults.year,
    allresults.organization,
    result_fair.no_result_fair / allresults.no_allresults AS org_fairness
FROM allresults
JOIN result_fair
    ON  result_fair.organization = allresults.organization
    AND result_fair.year = allresults.year;

DROP TABLE result_fair PURGE;
DROP TABLE allresults PURGE;

ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS;
-- Scratch: results with rich metadata per organization, year > 2003.
-- NOTE(review): the target table is named "..._pub" but both scratch queries
-- read from result, not publication -- confirm the intended source.
CREATE TEMPORARY TABLE result_fair AS
SELECT
    ro.organization       AS organization,
    COUNT(DISTINCT ro.id) AS no_result_fair
FROM result_organization ro
JOIN result p ON p.id = ro.id
WHERE title IS NOT NULL
  AND publisher IS NOT NULL
  AND abstract = TRUE
  AND year IS NOT NULL
  AND authors > 0
  AND CAST(year AS INT) > 2003
GROUP BY ro.organization;

-- Scratch: all results per organization, year > 2003.
CREATE TEMPORARY TABLE allresults AS
SELECT
    organization,
    COUNT(DISTINCT ro.id) AS no_allresults
FROM result_organization ro
JOIN result p ON p.id = ro.id
WHERE CAST(year AS INT) > 2003
GROUP BY organization;

-- indi_org_fairness_pub: fair / all, per organization.
-- STORED AS PARQUET added so the table uses the same storage format as every
-- other indicator table in this script.
CREATE TABLE IF NOT EXISTS indi_org_fairness_pub STORED AS PARQUET AS
SELECT
    allresults.organization,
    result_fair.no_result_fair / allresults.no_allresults AS org_fairness
FROM allresults
JOIN result_fair ON result_fair.organization = allresults.organization;

DROP TABLE result_fair PURGE;
DROP TABLE allresults PURGE;

ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS;
-- Scratch: results that have a row in result_pids and rich metadata, counted
-- per (organization, year), year > 2003.
CREATE TEMPORARY TABLE result_fair AS
SELECT
    year,
    ro.organization       AS organization,
    COUNT(DISTINCT ro.id) AS no_result_fair
FROM result_organization ro
JOIN result r
    ON r.id = ro.id
JOIN result_pids rp
    ON r.id = rp.id
WHERE (title IS NOT NULL)
  AND (publisher IS NOT NULL)
  AND (abstract = true)
  AND (year IS NOT NULL)
  AND (authors > 0)
  AND CAST(year AS INT) > 2003
GROUP BY
    ro.organization,
    year;

-- Scratch: all results per (organization, year), year > 2003.
CREATE TEMPORARY TABLE allresults AS
SELECT
    year,
    organization,
    COUNT(DISTINCT ro.id) AS no_allresults
FROM result_organization ro
JOIN result r
    ON r.id = ro.id
WHERE CAST(year AS INT) > 2003
GROUP BY
    organization,
    year;

-- indi_org_fairness_year: fair / all, per organization and year.
CREATE TABLE IF NOT EXISTS indi_org_fairness_year STORED AS PARQUET AS
SELECT
    allresults.year,
    allresults.organization,
    result_fair.no_result_fair / allresults.no_allresults AS org_fairness
FROM allresults
JOIN result_fair
    ON  result_fair.organization = allresults.organization
    AND result_fair.year = allresults.year;

DROP TABLE result_fair PURGE;
DROP TABLE allresults PURGE;

ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS;
-- Scratch: results that have a row in result_pids, counted per
-- (organization, year), year > 2003.
CREATE TEMPORARY TABLE result_with_pid AS
SELECT
    year,
    ro.organization       AS organization,
    COUNT(DISTINCT rp.id) AS no_result_with_pid
FROM result_organization ro
JOIN result_pids rp
    ON rp.id = ro.id
JOIN result r
    ON r.id = rp.id
WHERE CAST(year AS INT) > 2003
GROUP BY
    ro.organization,
    year;

-- Scratch: all results per (organization, year), year > 2003.
CREATE TEMPORARY TABLE allresults AS
SELECT
    year,
    organization,
    COUNT(DISTINCT ro.id) AS no_allresults
FROM result_organization ro
JOIN result r
    ON r.id = ro.id
WHERE CAST(year AS INT) > 2003
GROUP BY
    organization,
    year;

-- indi_org_findable_year: results with PID / all results, per organization
-- and year.
CREATE TABLE IF NOT EXISTS indi_org_findable_year STORED AS PARQUET AS
SELECT
    allresults.year,
    allresults.organization,
    result_with_pid.no_result_with_pid / allresults.no_allresults AS org_findable
FROM allresults
JOIN result_with_pid
    ON  result_with_pid.organization = allresults.organization
    AND result_with_pid.year = allresults.year;

DROP TABLE result_with_pid PURGE;
DROP TABLE allresults PURGE;

ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- Scratch: results that have a row in result_pids, counted per organization,
-- year > 2003.
CREATE TEMPORARY TABLE result_with_pid AS
SELECT
    ro.organization       AS organization,
    COUNT(DISTINCT rp.id) AS no_result_with_pid
FROM result_organization ro
JOIN result_pids rp
    ON rp.id = ro.id
JOIN result r
    ON r.id = rp.id
WHERE CAST(year AS INT) > 2003
GROUP BY
    ro.organization;

-- Scratch: all results per organization, year > 2003.
CREATE TEMPORARY TABLE allresults AS
SELECT
    organization,
    COUNT(DISTINCT ro.id) AS no_allresults
FROM result_organization ro
JOIN result r
    ON r.id = ro.id
WHERE CAST(year AS INT) > 2003
GROUP BY
    organization;

-- indi_org_findable: results with PID / all results, per organization.
CREATE TABLE IF NOT EXISTS indi_org_findable STORED AS PARQUET AS
SELECT
    allresults.organization,
    result_with_pid.no_result_with_pid / allresults.no_allresults AS org_findable
FROM allresults
JOIN result_with_pid
    ON result_with_pid.organization = allresults.organization;

DROP TABLE result_with_pid PURGE;
DROP TABLE allresults PURGE;

ANALYZE TABLE indi_org_findable COMPUTE STATISTICS;
CREATE TEMPORARY TABLE pubs_oa as
SELECT ro . organization , count ( distinct r . id ) no_oapubs FROM publication r
2022-08-02 12:39:34 +02:00
join result_organization ro on ro . id = r . id
join result_instance ri on ri . id = r . id
where ( ri . accessright = ' Open Access ' or ri . accessright = ' Embargo ' or ri . accessright = ' Open Source ' )
and cast ( r . year as int ) > 2003
2023-01-04 10:39:01 +01:00
group by ro . organization ;
CREATE TEMPORARY TABLE datasets_oa as
SELECT ro . organization , count ( distinct r . id ) no_oadatasets FROM dataset r
2022-08-02 12:39:34 +02:00
join result_organization ro on ro . id = r . id
join result_instance ri on ri . id = r . id
where ( ri . accessright = ' Open Access ' or ri . accessright = ' Embargo ' or ri . accessright = ' Open Source ' )
and cast ( r . year as int ) > 2003
2023-01-04 10:39:01 +01:00
group by ro . organization ;
CREATE TEMPORARY TABLE software_oa as
SELECT ro . organization , count ( distinct r . id ) no_oasoftware FROM software r
2022-08-02 12:39:34 +02:00
join result_organization ro on ro . id = r . id
join result_instance ri on ri . id = r . id
where ( ri . accessright = ' Open Access ' or ri . accessright = ' Embargo ' or ri . accessright = ' Open Source ' )
and cast ( r . year as int ) > 2003
2023-01-04 10:39:01 +01:00
group by ro . organization ;
CREATE TEMPORARY TABLE allpubs as
SELECT ro . organization organization , count ( ro . id ) no_allpubs FROM result_organization ro
2022-08-02 12:39:34 +02:00
join publication ps on ps . id = ro . id
where cast ( ps . year as int ) > 2003
2023-01-04 10:39:01 +01:00
group by ro . organization ;
CREATE TEMPORARY TABLE alldatasets as
SELECT ro . organization organization , count ( ro . id ) no_alldatasets FROM result_organization ro
2022-08-02 12:39:34 +02:00
join dataset ps on ps . id = ro . id
where cast ( ps . year as int ) > 2003
2023-01-04 10:39:01 +01:00
group by ro . organization ;
CREATE TEMPORARY TABLE allsoftware as
SELECT ro . organization organization , count ( ro . id ) no_allsoftware FROM result_organization ro
2022-08-02 12:39:34 +02:00
join software ps on ps . id = ro . id
where cast ( ps . year as int ) > 2003
2023-01-04 10:39:01 +01:00
group by ro . organization ;
CREATE TEMPORARY TABLE allpubsshare as
select pubs_oa . organization , pubs_oa . no_oapubs / allpubs . no_allpubs p from allpubs
join pubs_oa on allpubs . organization = pubs_oa . organization ;
-- Temporary table alldatasetssshare: per organization,
-- d = no_oadatasets / no_alldatasets. The inner join keeps only organizations
-- present in both datasets_oa and alldatasets.
create temporary table alldatasetssshare as
select
    oa.organization,
    oa.no_oadatasets / total.no_alldatasets as d
from datasets_oa oa
join alldatasets total
    on total.organization = oa.organization;
-- Temporary table allsoftwaresshare: per organization,
-- s = no_oasoftware / no_allsoftware. The inner join keeps only organizations
-- present in both software_oa and allsoftware.
create temporary table allsoftwaresshare as
select
    oa.organization,
    oa.no_oasoftware / total.no_allsoftware as s
from software_oa oa
join allsoftware total
    on total.organization = oa.organization;
-- indi_org_openess: one org_openess value per organization found in
-- allpubsshare (organizations without a row there are not emitted).
-- org_openess is the average of the shares available for the organization:
-- p always counts; d and s are added to the numerator, and to the count in
-- the denominator, only when they are non-null after the left joins.
create table if not exists indi_org_openess stored as parquet as
select
    pub_share.organization,
    (pub_share.p + coalesce(sw_share.s, 0) + coalesce(ds_share.d, 0))
        / (1
           + (case when sw_share.s is not null then 1 else 0 end)
           + (case when ds_share.d is not null then 1 else 0 end)) as org_openess
from allpubsshare pub_share
left join alldatasetssshare ds_share
    on ds_share.organization = pub_share.organization
left join allsoftwaresshare sw_share
    on sw_share.organization = pub_share.organization;
2023-01-04 10:39:01 +01:00
-- Remove the intermediate tables used to build indi_org_openess. The same
-- table names are created again further down for the per-year variant, so they
-- must be gone before that section runs. IF EXISTS keeps the cleanup
-- idempotent: a missing table no longer aborts the script.
DROP TABLE IF EXISTS pubs_oa PURGE;
DROP TABLE IF EXISTS datasets_oa PURGE;
DROP TABLE IF EXISTS software_oa PURGE;
DROP TABLE IF EXISTS allpubs PURGE;
DROP TABLE IF EXISTS alldatasets PURGE;
DROP TABLE IF EXISTS allsoftware PURGE;
DROP TABLE IF EXISTS allpubsshare PURGE;
DROP TABLE IF EXISTS alldatasetssshare PURGE;
DROP TABLE IF EXISTS allsoftwaresshare PURGE;

-- Refresh table statistics for the indicator table.
ANALYZE TABLE indi_org_openess COMPUTE STATISTICS;
-- Temporary table pubs_oa (per-year variant): for every (organization, year),
-- the number of distinct publications with year > 2003 that have at least one
-- result_instance whose accessright is one of the three values listed in the
-- filter.
create temporary table pubs_oa as
select
    pub.year,
    ro.organization,
    count(distinct pub.id) as no_oapubs
from publication pub
join result_organization ro
    on ro.id = pub.id
join result_instance ri
    on ri.id = pub.id
where cast(pub.year as int) > 2003
  and ri.accessright in (' Open Access ', ' Embargo ', ' Open Source ')
group by ro.organization, pub.year;
-- Temporary table datasets_oa (per-year variant): for every
-- (organization, year), the number of distinct datasets with year > 2003 that
-- have at least one result_instance whose accessright is one of the three
-- values listed in the filter.
create temporary table datasets_oa as
select
    ds.year,
    ro.organization,
    count(distinct ds.id) as no_oadatasets
from dataset ds
join result_organization ro
    on ro.id = ds.id
join result_instance ri
    on ri.id = ds.id
where cast(ds.year as int) > 2003
  and ri.accessright in (' Open Access ', ' Embargo ', ' Open Source ')
group by ro.organization, ds.year;
-- Temporary table software_oa (per-year variant): for every
-- (organization, year), the number of distinct software results with
-- year > 2003 that have at least one result_instance whose accessright is one
-- of the three values listed in the filter.
create temporary table software_oa as
select
    sw.year,
    ro.organization,
    count(distinct sw.id) as no_oasoftware
from software sw
join result_organization ro
    on ro.id = sw.id
join result_instance ri
    on ri.id = sw.id
where cast(sw.year as int) > 2003
  and ri.accessright in (' Open Access ', ' Embargo ', ' Open Source ')
group by ro.organization, sw.year;
-- Temporary table allpubs (per-year variant): for every (organization, year),
-- the number of result_organization rows that point to a publication with
-- year > 2003.
create temporary table allpubs as
select
    pub.year,
    ro.organization as organization,
    count(ro.id) as no_allpubs
from publication pub
join result_organization ro
    on ro.id = pub.id
where cast(pub.year as int) > 2003
group by ro.organization, pub.year;
-- Temporary table alldatasets (per-year variant): for every
-- (organization, year), the number of result_organization rows that point to
-- a dataset with year > 2003.
create temporary table alldatasets as
select
    ds.year,
    ro.organization as organization,
    count(ro.id) as no_alldatasets
from dataset ds
join result_organization ro
    on ro.id = ds.id
where cast(ds.year as int) > 2003
group by ro.organization, ds.year;
-- Temporary table allsoftware (per-year variant): for every
-- (organization, year), the number of result_organization rows that point to
-- a software result with year > 2003.
create temporary table allsoftware as
select
    sw.year,
    ro.organization as organization,
    count(ro.id) as no_allsoftware
from software sw
join result_organization ro
    on ro.id = sw.id
where cast(sw.year as int) > 2003
group by ro.organization, sw.year;
-- Temporary table allpubsshare (per-year variant): for every
-- (organization, year) present in both inputs, p = no_oapubs / no_allpubs.
-- Years are matched after casting both sides to int; the emitted year is the
-- one from allpubs.
create temporary table allpubsshare as
select
    total.year,
    oa.organization,
    oa.no_oapubs / total.no_allpubs as p
from allpubs total
join pubs_oa oa
    on total.organization = oa.organization
   and cast(total.year as int) = cast(oa.year as int);
-- Temporary table alldatasetssshare (per-year variant): for every
-- (organization, year) present in both inputs,
-- d = no_oadatasets / no_alldatasets. Years are matched after casting both
-- sides to int; the emitted year is the one from alldatasets.
create temporary table alldatasetssshare as
select
    total.year,
    oa.organization,
    oa.no_oadatasets / total.no_alldatasets as d
from alldatasets total
join datasets_oa oa
    on total.organization = oa.organization
   and cast(total.year as int) = cast(oa.year as int);
-- Temporary table allsoftwaresshare (per-year variant): for every
-- (organization, year) present in both inputs,
-- s = no_oasoftware / no_allsoftware. Years are matched after casting both
-- sides to int; the emitted year is the one from allsoftware.
create temporary table allsoftwaresshare as
select
    total.year,
    oa.organization,
    oa.no_oasoftware / total.no_allsoftware as s
from allsoftware total
join software_oa oa
    on total.organization = oa.organization
   and cast(total.year as int) = cast(oa.year as int);
-- indi_org_openess_year: one org_openess value per (year, organization) found
-- in allpubsshare (pairs without a row there are not emitted).
-- org_openess is the average of the shares available for that pair:
-- p always counts; d and s are added to the numerator, and to the count in
-- the denominator, only when they are non-null after the left joins.
-- NOTE(review): the joins below compare year without the int cast that the
-- share tables use when they are built; confirm year values are formatted
-- uniformly across publication, dataset and software.
create table if not exists indi_org_openess_year stored as parquet as
select
    pub_share.year,
    pub_share.organization,
    (pub_share.p + coalesce(sw_share.s, 0) + coalesce(ds_share.d, 0))
        / (1
           + (case when sw_share.s is not null then 1 else 0 end)
           + (case when ds_share.d is not null then 1 else 0 end)) as org_openess
from allpubsshare pub_share
left join alldatasetssshare ds_share
    on ds_share.organization = pub_share.organization
   and ds_share.year = pub_share.year
left join allsoftwaresshare sw_share
    on sw_share.organization = pub_share.organization
   and sw_share.year = pub_share.year;
2023-01-04 10:39:01 +01:00
-- Remove the intermediate tables used to build indi_org_openess_year.
-- IF EXISTS keeps the cleanup idempotent: a missing table no longer aborts
-- the script.
DROP TABLE IF EXISTS pubs_oa PURGE;
DROP TABLE IF EXISTS datasets_oa PURGE;
DROP TABLE IF EXISTS software_oa PURGE;
DROP TABLE IF EXISTS allpubs PURGE;
DROP TABLE IF EXISTS alldatasets PURGE;
DROP TABLE IF EXISTS allsoftware PURGE;
DROP TABLE IF EXISTS allpubsshare PURGE;
DROP TABLE IF EXISTS alldatasetssshare PURGE;
DROP TABLE IF EXISTS allsoftwaresshare PURGE;

-- Refresh table statistics for the indicator table.
ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_has_preprint: one row per id found in publication_classifications,
-- with has_preprint = 1 when that id has at least one classification row of
-- the type matched below, and 0 otherwise.
-- NOTE(review): the driving table is publication_classifications, so ids with
-- no classification rows at all do not appear here; confirm that is intended.
create table if not exists indi_pub_has_preprint stored as parquet as
select distinct
    pc.id,
    case when preprints.id is null then 0 else 1 end as has_preprint
from publication_classifications pc
left join (
    select cls.id
    from publication_classifications cls
    where cls.type = ' Preprint '
) preprints
    on pc.id = preprints.id;

-- Refresh table statistics for the indicator table.
ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- indi_pub_in_subscribed: one row per publication id, with
-- is_subscription = 1 when the publication has rows in indi_pub_gold_oa,
-- indi_pub_hybrid and indi_pub_in_transformative whose flags
-- (is_gold, is_hybrid, is_transformative) are all 0, and 0 otherwise.
create table if not exists indi_pub_in_subscribed stored as parquet as
select distinct
    pub.id,
    case when subscribed.id is null then 0 else 1 end as is_subscription
from publication pub
left join (
    select src.id
    from publication src
    join indi_pub_gold_oa gold
        on src.id = gold.id
    join indi_pub_hybrid hyb
        on src.id = hyb.id
    join indi_pub_in_transformative trans
        on src.id = trans.id
    where gold.is_gold = 0
      and hyb.is_hybrid = 0
      and trans.is_transformative = 0
) subscribed
    on pub.id = subscribed.id;

-- Refresh table statistics for the indicator table.
ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS;
2022-08-02 12:39:34 +02:00
2023-01-04 10:39:01 +01:00
-- indi_result_with_pid: one row per result id, with result_with_pid = 1 when
-- the id has at least one row in result_pids and 0 otherwise.
-- Stored as parquet so the table uses the same storage format as the other
-- indi_* tables created in this script.
create table if not exists indi_result_with_pid stored as parquet as
select distinct
    r.id,
    coalesce(tmp.result_with_pid, 0) as result_with_pid
from result r
left outer join (
    -- result_pids may hold several rows per id; the outer DISTINCT collapses
    -- the duplicates this join produces.
    select rp.id, 1 as result_with_pid
    from result_pids rp
) tmp
    on r.id = tmp.id;

-- Refresh table statistics for the indicator table.
ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;
2023-05-26 09:25:28 +02:00
-- indi_impact_measures: one row per distinct (result, measure) combination
-- obtained by exploding result.measures, excluding the measures whose id
-- matches the two literals filtered out below.
--   id           result id with its first three characters removed
--                (substr(id, 4)); previously this column had no alias and so
--                received an engine-generated name
--   impactmetric id of the exploded measure
--   score        first element of the measure's unit.value array
--   score_dec    score cast to decimal(6, 3)
--                NOTE(review): values that do not fit decimal(6, 3) may come
--                out as NULL depending on the engine; confirm the score range
--   class        second element of the measure's unit.value array
-- Stored as parquet so the table uses the same storage format as the other
-- indi_* tables created in this script.
create table if not exists indi_impact_measures stored as parquet as
select distinct
    substr(id, 4) as id,
    measures_ids.id as impactmetric,
    measures_ids.unit.value[0] as score,
    cast(measures_ids.unit.value[0] as decimal(6, 3)) as score_dec,
    measures_ids.unit.value[1] as class
from result
lateral view explode(measures) measures as measures_ids
where measures_ids.id != ' views '
  and measures_ids.id != ' downloads ';

-- Refresh table statistics for the indicator table.
ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;