Merge pull request '[stats wf] latest version of indicators + added FOS classification' (#232) from antonis.lempesis/dnet-hadoop:beta into beta

Reviewed-on: D-Net/dnet-hadoop#232
This commit is contained in:
Claudio Atzori 2022-08-02 12:57:15 +02:00
commit 27681cf6bf
4 changed files with 734 additions and 288 deletions

View File

@ -1,4 +1,4 @@
---- Sprint 1 ---- -- Sprint 1 ----
create table indi_pub_green_oa stored as parquet as create table indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select distinct p.id, coalesce(green_oa, 0) as green_oa
from publication p from publication p
@ -22,7 +22,8 @@ select p.id, 1 as grey_lit
from publication p from publication p
join result_classifications rt on rt.id = p.id join result_classifications rt on rt.id = p.id
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; not exists (select 1 from result_classifications rc where type ='Other literature type'
and rc.id=p.id)) tmp on p.id=tmp.id;
compute stats indi_pub_grey_lit; compute stats indi_pub_grey_lit;
@ -36,7 +37,8 @@ where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp
on tmp.id=p.id; on tmp.id=p.id;
compute stats indi_pub_doi_from_crossref; compute stats indi_pub_doi_from_crossref;
---- Sprint 2 ----
-- Sprint 2 ----
create table indi_result_has_cc_licence stored as parquet as create table indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from result r from result r
@ -82,31 +84,61 @@ on r.id= tmp.id;
compute stats indi_funded_result_with_fundref; compute stats indi_funded_result_with_fundref;
-- create table indi_result_org_country_collab stored as parquet as create table indi_result_org_collab stored as parquet as
-- with tmp as select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations
-- (select o.id as id, o.country , ro.id as result,r.type from organization o from result_organization as o1
-- join result_organization ro on o.id=ro.organization join result_organization as o2 on o1.id=o2.id and o1.organization!=o2.organization
-- join result r on r.id=ro.id where o.country <> 'UNKNOWN') group by o1.organization, o2.organization;
-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id and o1.country<>o2.country
-- group by o1.id, o1.type,o2.country;
--
-- compute stats indi_result_org_country_collab;
-- create table indi_result_org_collab stored as parquet as compute stats indi_result_org_collab;
-- with tmp as
-- (select o.id, ro.id as result,r.type from organization o create table indi_result_org_country_collab stored as parquet as
-- join result_organization ro on o.id=ro.organization with tmp as
-- join result r on r.id=ro.id) (select o.id as id, o.country , ro.id as result,r.type from organization o
-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations join result_organization ro on o.id=ro.organization
-- from tmp as o1 join result r on r.id=ro.id where o.country <> 'UNKNOWN')
-- join tmp as o2 on o1.result=o2.result select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- where o1.id<>o2.id from tmp as o1
-- group by o1.id, o2.id, o1.type; join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id and o1.country<>o2.country
-- compute stats indi_result_org_collab; group by o1.id, o1.type,o2.country;
compute stats indi_result_org_country_collab;
create table indi_result_org_collab stored as parquet as
with tmp as
(select o.id, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id)
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id
group by o1.id, o2.id, o1.type;
compute stats indi_result_org_collab;
create table indi_project_collab_org stored as parquet as
select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations
from organization_projects as o1
join organization_projects as o2 on o1.project=o2.project
where o1.id!=o2.id
group by o1.id, o2.id;
compute stats indi_project_collab_org;
create table indi_project_collab_org_country stored as parquet as
with tmp as
(select o.id organization, o.country , ro.project as project from organization o
join organization_projects ro on o.id=ro.id
and o.country <> 'UNKNOWN')
select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country
group by o1.organization, o2.country;
compute stats indi_project_collab_org_country;
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op
@ -125,7 +157,7 @@ create table indi_result_country_collab stored as parquet as
with tmp as with tmp as
(select country, ro.id as result,r.type from organization o (select country, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id) join result r on r.id=ro.id where country <> 'UNKNOWN')
select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
@ -182,176 +214,178 @@ where d.type like '%Journal%' and ri.accessright='Closed Access' and
on tmp.id=ri.id; on tmp.id=ri.id;
compute stats indi_pub_closed_other_open; compute stats indi_pub_closed_other_open;
---- Sprint 5 ---- ---- Sprint 5 ----
create table indi_result_no_of_copies stored as parquet as create table indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from result_instance group by id; select id, count(id) as number_of_copies from result_instance group by id;
compute stats indi_result_no_of_copies; compute stats indi_result_no_of_copies;
---- Sprint 6 ---- ---- Sprint 6 ----
create table indi_pub_gold_oa stored as parquet as --create table indi_pub_gold_oa stored as parquet as
WITH gold_oa AS ( --WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn -- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_1 != "" -- WHERE issn_1 != ""
UNION ALL -- UNION ALL
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn -- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_2 != "" ), -- WHERE issn_2 != "" ),
issn AS ( --issn AS (
SELECT * FROM -- SELECT * FROM
(SELECT id, issn_printed as issn -- (SELECT id, issn_printed as issn
FROM datasource WHERE issn_printed IS NOT NULL -- FROM datasource WHERE issn_printed IS NOT NULL
UNION -- UNION
SELECT id, issn_online as issn -- SELECT id, issn_online as issn
FROM datasource WHERE issn_online IS NOT NULL) as issn -- FROM datasource WHERE issn_online IS NOT NULL) as issn
WHERE LENGTH(issn) > 7) -- WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold --SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM publication_datasources pd --FROM publication_datasources pd
LEFT OUTER JOIN ( --LEFT OUTER JOIN (
SELECT pd.id, 1 as is_gold FROM publication_datasources pd -- SELECT pd.id, 1 as is_gold FROM publication_datasources pd
JOIN issn on issn.id=pd.datasource -- JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; -- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
compute stats indi_pub_gold_oa; --compute stats indi_pub_gold_oa;
--
--create table indi_datasets_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION
-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT *
-- FROM (
-- SELECT id,issn_printed as issn
-- FROM datasource
-- WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource
-- WHERE issn_online IS NOT NULL ) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM dataset_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
--compute stats indi_datasets_gold_oa;
create table indi_datasets_gold_oa stored as parquet as --create table indi_software_gold_oa stored as parquet as
WITH gold_oa AS ( --WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn -- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_1 != "" -- WHERE issn_1 != ""
UNION -- UNION
ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn -- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_2 != "" ), -- WHERE issn_2 != "" ),
issn AS ( --issn AS (
SELECT * -- SELECT *
FROM ( -- FROM (
SELECT id,issn_printed as issn -- SELECT id,issn_printed as issn
FROM datasource -- FROM datasource
WHERE issn_printed IS NOT NULL -- WHERE issn_printed IS NOT NULL
UNION -- UNION
SELECT id, issn_online as issn -- SELECT id, issn_online as issn
FROM datasource -- FROM datasource
WHERE issn_online IS NOT NULL ) as issn -- WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7) -- WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold --SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM dataset_datasources pd --FROM software_datasources pd
LEFT OUTER JOIN ( --LEFT OUTER JOIN (
SELECT pd.id, 1 as is_gold FROM dataset_datasources pd -- SELECT pd.id, 1 as is_gold FROM software_datasources pd
JOIN issn on issn.id=pd.datasource -- JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; -- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
compute stats indi_datasets_gold_oa; --compute stats indi_software_gold_oa;
create table indi_software_gold_oa stored as parquet as
WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
FROM stats_ext.oa_journals
WHERE issn_1 != ""
UNION
ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
FROM stats_ext.oa_journals
WHERE issn_2 != "" ),
issn AS (
SELECT *
FROM (
SELECT id,issn_printed as issn
FROM datasource
WHERE issn_printed IS NOT NULL
UNION
SELECT id, issn_online as issn
FROM datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM software_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_gold FROM software_datasources pd
JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
compute stats indi_software_gold_oa;
create table indi_org_findable stored as parquet as
with result_with_pid as (
select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id
group by ro.organization),
result_has_abstract as (
select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro
join result rp on rp.id=ro.id where rp.abstract=true
group by ro.organization),
allresults as (
select organization, count(distinct id) no_allresults from result_organization
group by organization),
result_with_pid_share as (
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization),
result_with_abstract_share as (
select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share
from allresults
join result_has_abstract on result_has_abstract.organization=allresults.organization)
select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable
from allresults
join result_with_pid_share on result_with_pid_share.organization=allresults.organization
left outer join (
select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
compute stats indi_org_findable;
create table indi_org_openess stored as parquet as
WITH datasets_oa as (
SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.dataset ds on dg.id=ds.id
WHERE dg.is_gold=1
group by ro.organization),
software_oa as (
SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.software ds on dg.id=ds.id
WHERE dg.is_gold=1
group by ro.organization),
pubs_oa as (
SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.publication ds on dg.id=ds.id
where dg.is_gold=1
group by ro.organization),
allpubs as (
SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join openaire_prod_stats.publication ps on ps.id=ro.id
group by ro.organization),
alldatasets as (
SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join openaire_prod_stats.dataset ps on ps.id=ro.id
group by ro.organization),
allsoftware as (
SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join openaire_prod_stats.software ps on ps.id=ro.id
group by ro.organization),
allpubsshare as (
select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization),
alldatasetssshare as (
select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c
from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization),
allsoftwaresshare as (
select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware
join software_oa on allsoftware.organization=software_oa.organization)
select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess
FROM allpubsshare
left outer join (
select organization,c from
alldatasetssshare) tmp on tmp.organization=allpubsshare.organization
left outer join (
select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
compute stats indi_org_openess;
--create table indi_org_findable stored as parquet as
--with result_with_pid as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
-- join result_pids rp on rp.id=ro.id
-- group by ro.organization),
--result_has_abstract as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro
-- join result rp on rp.id=ro.id where rp.abstract=true
-- group by ro.organization),
--allresults as (
-- select organization, count(distinct id) no_allresults from result_organization
-- group by organization),
--result_with_pid_share as (
-- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share
-- from allresults
-- join result_with_pid on result_with_pid.organization=allresults.organization),
--result_with_abstract_share as (
-- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share
-- from allresults
-- join result_has_abstract on result_has_abstract.organization=allresults.organization)
--select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable
--from allresults
--join result_with_pid_share on result_with_pid_share.organization=allresults.organization
--left outer join (
-- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
--
--compute stats indi_org_findable;
--
--create table indi_org_openess stored as parquet as
--WITH datasets_oa as (
-- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join dataset ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--software_oa as (
-- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join software ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--pubs_oa as (
-- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join publication ds on dg.id=ds.id
-- where dg.is_gold=1
-- group by ro.organization),
--allpubs as (
-- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
-- join publication ps on ps.id=ro.id
-- group by ro.organization),
--alldatasets as (
-- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
-- join dataset ps on ps.id=ro.id
-- group by ro.organization),
--allsoftware as (
-- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
-- join software ps on ps.id=ro.id
-- group by ro.organization),
--allpubsshare as (
-- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
-- join pubs_oa on allpubs.organization=pubs_oa.organization),
--alldatasetssshare as (
-- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c
-- from alldatasets
-- join datasets_oa on alldatasets.organization=datasets_oa.organization),
--allsoftwaresshare as (
-- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
-- from allsoftware
-- join software_oa on allsoftware.organization=software_oa.organization)
--select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess
--FROM allpubsshare
--left outer join (
-- select organization,c from
-- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization
--left outer join (
-- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
--
--compute stats indi_org_openess;
--
create table indi_pub_hybrid_oa_with_cc stored as parquet as create table indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS ( WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
@ -418,3 +452,394 @@ GROUP BY result_id, repository_id, `year`
order by `year` asc, result_id; order by `year` asc, result_id;
compute stats indi_pub_downloads_datasource_year; compute stats indi_pub_downloads_datasource_year;
---- Sprint 7 ----
create table indi_pub_gold_oa stored as parquet as
WITH gold_oa AS ( SELECT
issn_l,
journal_is_in_doaj,
journal_is_oa,
issn_1 as issn
FROM
stats_ext.oa_journals
WHERE
issn_1 != ""
UNION
ALL SELECT
issn_l,
journal_is_in_doaj,
journal_is_oa,
issn_2 as issn
FROM
stats_ext.oa_journals
WHERE
issn_2 != "" ), issn AS ( SELECT
*
FROM
( SELECT
id,
issn_printed as issn
FROM
datasource
WHERE
issn_printed IS NOT NULL
UNION
SELECT
id,
issn_online as issn
FROM
datasource
WHERE
issn_online IS NOT NULL or id like '%doajarticles%') as issn
WHERE
LENGTH(issn) > 7)
SELECT
DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM
publication_datasources pd
left outer join(
select pd.id, 1 as is_gold FROM publication_datasources pd
JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp
on pd.id=tmp.id;
compute stats indi_pub_gold_oa;
create table indi_pub_hybrid stored as parquet as
WITH gold_oa AS ( SELECT
issn_l,
journal_is_in_doaj,
journal_is_oa,
issn_1 as issn,
has_apc
FROM
stats_ext.oa_journals
WHERE
issn_1 != ""
UNION
ALL SELECT
issn_l,
journal_is_in_doaj,
journal_is_oa,
issn_2 as issn,
has_apc
FROM
stats_ext.oa_journals
WHERE
issn_2 != "" ), issn AS ( SELECT
*
FROM
( SELECT
id,
issn_printed as issn
FROM
datasource
WHERE
issn_printed IS NOT NULL
UNION
SELECT
id,
issn_online as issn
FROM
datasource
WHERE
issn_online IS NOT NULL or id like '%doajarticles%') as issn
WHERE
LENGTH(issn) > 7)
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
from publication_datasources pd
left outer join (
select pd.id, 1 as is_hybrid from publication_datasources pd
join datasource d on d.id=pd.datasource
join issn on issn.id=pd.datasource
join gold_oa on issn.issn=gold_oa.issn
where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp
on pd.id=tmp.id;
compute stats indi_pub_hybrid;
create table indi_org_fairness stored as parquet as
--return results with PIDs, and rich metadata group by organization
with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join result r on r.id=ro.id
--join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003
group by ro.organization),
--return all results group by organization
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where year>2003
group by organization)
--return results_fair/all_results
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness;
create table indi_org_fairness_pub_pr stored as parquet as
with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair
from result_organization ro
join publication p on p.id=ro.id
join indi_pub_doi_from_crossref dc on dc.id=p.id
join indi_pub_grey_lit gl on gl.id=p.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null)
and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0
group by ro.organization),
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
join publication p on p.id=ro.id
where cast(year as int)>2003
group by organization)
--return results_fair/all_results
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness_pub_pr;
create table indi_org_fairness_pub_year stored as parquet as
with result_fair as
(select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join publication p on p.id=ro.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization, year),
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join publication p on p.id=ro.id
where cast(year as int)>2003
group by organization, year)
select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year;
compute stats indi_org_fairness_pub_year;
create table indi_org_fairness_pub as
with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair
from result_organization ro
join publication p on p.id=ro.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null)
and (authors>0) and cast(year as int)>2003
group by ro.organization),
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
join publication p on p.id=ro.id
where cast(year as int)>2003
group by organization)
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness_pub;
create table indi_org_fairness_year stored as parquet as
with result_fair as
(select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join result r on r.id=ro.id
join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003
group by ro.organization, year),
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where year>2003
group by organization, year)
--return results_fair/all_results
select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year;
compute stats indi_org_fairness_year;
create table indi_org_findable_year stored as parquet as
--return results with PIDs group by organization,year
with result_with_pid as
(select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id
join result r on r.id=rp.id
where year >2003
group by ro.organization, year),
--return all results group by organization,year
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where year >2003
group by organization, year)
--return results_with_pid/all_results
select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year;
compute stats indi_org_findable_year;
create table indi_org_findable stored as parquet as
--return results with PIDs group by organization
with result_with_pid as
(select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id
join result r on r.id=rp.id
where year >2003
group by ro.organization),
--return all results group by organization
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id
where year >2003
group by organization)
--return results_with_pid/all_results
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization;
compute stats indi_org_findable;
create table indi_org_openess stored as parquet as
WITH pubs_oa as (
SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization),
datasets_oa as (
SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization),
software_oa as (
SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization),
allpubs as (
SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join publication ps on ps.id=ro.id
where cast(ps.year as int)>2003
group by ro.organization),
alldatasets as (
SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join dataset ps on ps.id=ro.id
where cast(ps.year as int)>2003
group by ro.organization),
allsoftware as (
SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join software ps on ps.id=ro.id
where cast(ps.year as int)>2003
group by ro.organization),
allpubsshare as (
select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization),
alldatasetssshare as (
select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d
from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization),
allsoftwaresshare as (
select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware
join software_oa on allsoftware.organization=software_oa.organization)
select allpubsshare.organization,
(p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
left outer join (select organization,d from
alldatasetssshare) tmp1
on tmp1.organization=allpubsshare.organization
left outer join (select organization,s from
allsoftwaresshare) tmp2
on tmp2.organization=allpubsshare.organization;
compute stats indi_org_openess;
create table indi_org_openess_year stored as parquet as
WITH pubs_oa as (
SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization,r.year),
datasets_oa as (
SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization, r.year),
software_oa as (
SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r
join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003
group by ro.organization, r.year),
allpubs as (
SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join publication p on p.id=ro.id where cast(p.year as int)>2003
group by ro.organization, p.year),
alldatasets as (
SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join dataset d on d.id=ro.id where cast(d.year as int)>2003
group by ro.organization, d.year),
allsoftware as (
SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join software s on s.id=ro.id where cast(s.year as int)>2003
group by ro.organization, s.year),
allpubsshare as (
select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)),
alldatasetssshare as (
select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d
from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)),
allsoftwaresshare as (
select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware
join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int))
select allpubsshare.year, allpubsshare.organization,
(p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
left outer join (select year, organization,d from
alldatasetssshare) tmp1
on tmp1.organization=allpubsshare.organization and tmp1.year=allpubsshare.year
left outer join (select year, organization,s from
allsoftwaresshare) tmp2
on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year;
compute stats indi_org_openess_year;
create table indi_pub_has_preprint stored as parquet as
select distinct p.id, coalesce(has_preprint, 0) as has_preprint
from publication_classifications p
left outer join (
select p.id, 1 as has_preprint
from publication_classifications p
where p.type='Preprint') tmp
on p.id= tmp.id;
compute stats indi_pub_has_preprint;
create table indi_pub_in_subscribed stored as parquet as
select distinct p.id, coalesce(is_subscription, 0) as is_subscription
from publication p
left outer join(
select p.id, 1 as is_subscription from publication p
join indi_pub_gold_oa g on p.id=g.id
join indi_pub_hybrid h on p.id=h.id
join indi_pub_in_transformative t on p.id=t.id
where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp
on p.id=tmp.id;
compute stats indi_pub_in_subscribed;
create table indi_result_with_pid as
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from result p
left outer join (
select p.id, 1 as result_with_pid
from result_pids p) tmp
on p.id= tmp.id;
compute stats indi_result_with_pid;

View File

@ -38,7 +38,14 @@ create table TARGET.result stored as parquet as
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII) 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
'openorgs____::4ac562f0376fce3539504567649cb373' -- University of Patras
) )) foo; ) )) foo;
compute stats TARGET.result; compute stats TARGET.result;
@ -107,6 +114,9 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics; compute stats TARGET.result_topics;
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_fos;
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;

View File

@ -11,13 +11,13 @@ where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' WHERE r.reltype = 'resultProject' and r.target like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
create table ${stats_db_name}.project_classification STORED AS PARQUET as create table ${stats_db_name}.project_classification STORED AS PARQUET as

View File

@ -123,6 +123,16 @@ UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics; FROM ${stats_db_name}.otherresearchproduct_topics;
create table ${stats_db_name}.result_fos stored as parquet as
with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
@ -135,3 +145,4 @@ select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS da
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;