Merge pull request '[stats wf] result_result relations, usage stats, monitor views, indicator for sprint 5' (#179) from antonis.lempesis/dnet-hadoop:beta into beta

Reviewed-on: D-Net/dnet-hadoop#179
This commit is contained in:
Claudio Atzori 2022-01-03 14:52:11 +01:00
commit fe7e5f4748
4 changed files with 125 additions and 266 deletions

View File

@ -27,6 +27,22 @@ CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
SELECT * SELECT *
FROM ${external_stats_db_name}.licenses_normalized; FROM ${external_stats_db_name}.licenses_normalized;
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Usage statistics
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
create or replace view ${stats_db_name}.usage_stats as
select * from openaire_prod_usage_stats.usage_stats;
create or replace view ${stats_db_name}.downloads_stats as
select * from openaire_prod_usage_stats.downloads_stats;
create or replace view ${stats_db_name}.pageviews_stats as
select * from openaire_prod_usage_stats.pageviews_stats;
create or replace view ${stats_db_name}.views_stats as
select * from openaire_prod_usage_stats.views_stats;
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------

View File

@ -67,4 +67,17 @@ from (
LATERAL VIEW explode(author) a as auth LATERAL VIEW explode(author) a as auth
LATERAL VIEW explode(auth.pid) ap as auth_pid LATERAL VIEW explode(auth.pid) ap as auth_pid
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
create table ${stats_db_name}.result_result stored as parquet as
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
where reltype='resultResult'
and r1.resulttype.classname!=r2.resulttype.classname
and r1.datainfo.deletedbyinference=false
and r2.datainfo.deletedbyinference=false
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false;

View File

@ -1,28 +1,29 @@
---- Sprint 1 ----
create table indi_pub_green_oa stored as parquet as create table indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select distinct p.id, coalesce(green_oa, 0) as green_oa
from publication p from publication p
left outer join ( left outer join (
select p.id, 1 as green_oa select p.id, 1 as green_oa
from publication p from publication p
join result_instance ri on ri.id = p.id join result_instance ri on ri.id = p.id
join datasource on datasource.id = ri.hostedby join datasource on datasource.id = ri.hostedby
where datasource.type like '%Repository%' where datasource.type like '%Repository%'
and (ri.accessright = 'Open Access' and (ri.accessright = 'Open Access'
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
on p.id= tmp.id; on p.id= tmp.id;
create table indi_pub_grey_lit stored as parquet as create table indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit select distinct p.id, coalesce(grey_lit, 0) as grey_lit
from publication p from publication p
left outer join ( left outer join (
select p.id, 1 as grey_lit select p.id, 1 as grey_lit
from publication p from publication p
join result_classifications rt on rt.id = p.id join result_classifications rt on rt.id = p.id
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id;
create table indi_pub_doi_from_crossref stored as parquet as create table indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from publication p from publication p
left outer join left outer join
(select ri.id, 1 as doi_from_crossref from result_instance ri (select ri.id, 1 as doi_from_crossref from result_instance ri
@ -33,7 +34,7 @@ on tmp.id=p.id;
create table indi_pub_gold_oa stored as parquet as create table indi_pub_gold_oa stored as parquet as
select distinct p.id, coalesce(gold_oa, 0) as gold_oa select distinct p.id, coalesce(gold_oa, 0) as gold_oa
from publication p from publication p
left outer join ( left outer join (
select p.id, 1 as gold_oa select p.id, 1 as gold_oa
from publication p from publication p
join result_instance ri on ri.id = p.id join result_instance ri on ri.id = p.id
@ -41,255 +42,56 @@ join datasource on datasource.id = ri.hostedby
where datasource.id like '%doajarticles%') tmp where datasource.id like '%doajarticles%') tmp
on p.id= tmp.id; on p.id= tmp.id;
--create table indi_project_pubs_count stored as parquet as ---- Sprint 2 ----
--select pr.id id, count(p.id) total_pubs from project_results pr create table indi_result_has_cc_licence stored as parquet as
--join publication p on p.id=pr.result select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
--group by pr.id; from result r
left outer join (select r.id, license.type as lic from result r
--create table indi_project_datasets_count stored as parquet as join result_licenses as license on license.id = r.id
--select pr.id id, count(d.id) total_datasets from project_results pr
--join dataset d on d.id=pr.result
--group by pr.id;
--create table indi_project_software_count stored as parquet as
--select pr.id id, count(s.id) total_software from project_results pr
--join software s on s.id=pr.result
--group by pr.id;
--create table indi_project_otherresearch_count stored as parquet as
--select pr.id id, count(o.id) total_other from project_results pr
--join otherresearchproduct o on o.id=pr.result
--group by pr.id;
--create table indi_pub_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--from
--(SELECT year, country, SUM(CASE
--WHEN bestlicence='Open Access' THEN 1
--ELSE 0
--END) AS OpenAccess, SUM(CASE
--WHEN bestlicence<>'Open Access' THEN 1
--ELSE 0
--END) AS NonOpenAccess
--FROM publication p
--join result_organization ro on p.id=ro.id
--join organization o on o.id=ro.organization
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by year, country) tmp;
--create table indi_dataset_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--from
--(SELECT year, country, SUM(CASE
--WHEN bestlicence='Open Access' THEN 1
--ELSE 0
--END) AS OpenAccess, SUM(CASE
--WHEN bestlicence<>'Open Access' THEN 1
--ELSE 0
--END) AS NonOpenAccess
--FROM dataset d
--join result_organization ro on d.id=ro.id
--join organization o on o.id=ro.organization
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by year, country) tmp;
--create table indi_software_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
--from
-- (SELECT year, country, SUM(CASE
--WHEN bestlicence='Open Access' THEN 1
-- ELSE 0
--END) AS OpenAccess, SUM(CASE
-- WHEN bestlicence<>'Open Access' THEN 1
-- ELSE 0
-- END) AS NonOpenAccess
-- FROM software s
-- join result_organization ro on s.id=ro.id
-- join organization o on o.id=ro.organization
-- where cast(year as int)>=2003 and cast(year as int)<=2021
-- group by year, country) tmp;
--create table indi_other_avg_year_country_oa stored as parquet as
--select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
--round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
-- from
-- (SELECT year, country, SUM(CASE
-- WHEN bestlicence='Open Access' THEN 1
-- ELSE 0
-- END) AS OpenAccess, SUM(CASE
-- WHEN bestlicence<>'Open Access' THEN 1
-- ELSE 0
-- END) AS NonOpenAccess
-- FROM otherresearchproduct orp
-- join result_organization ro on orp.id=ro.id
-- join organization o on o.id=ro.organization
-- where cast(year as int)>=2003 and cast(year as int)<=2021
-- group by year, country) tmp;
--create table indi_pub_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join publication p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofpubs
--from total;
--create table indi_dataset_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join dataset p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofdataset
--from total;
--create table indi_software_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join software p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofsoftware
--from total;
--create table indi_other_avg_year_context_oa stored as parquet as
--with total as
--(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
--join context c on pc.concept like concat('%',c.id,'%')
--join otherresearchproduct p on p.id=pc.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by c.name, year )
--select year, name, round(no_of_pubs/total*100,3) averageofother
--from total;
--create table indi_other_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from otherresearchproduct_datasources pd
--join datasource d on datasource=d.id
--join otherresearchproduct p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
--from total;
--create table indi_software_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from software_datasources pd
--join datasource d on datasource=d.id
--join software p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
--from total;
--create table indi_dataset_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from dataset_datasources pd
--join datasource d on datasource=d.id
--join dataset p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
--from total;
--create table indi_pub_avg_year_content_oa stored as parquet as
--with total as
--(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
--from publication_datasources pd
--join datasource d on datasource=d.id
--join publication p on p.id=pd.id
--where cast(year as int)>=2003 and cast(year as int)<=2021
--group by d.type, year)
--select year, type, round(no_of_pubs/total*100,3) averageOfPubs
--from total;
create table indi_pub_has_cc_licence stored as parquet as
select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from publication p
left outer join (select p.id, license.type as lic from publication p
join publication_licenses as license on license.id = p.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on p.id= tmp.id; on r.id= tmp.id;
create table indi_pub_has_cc_licence_url stored as parquet as create table indi_result_has_cc_licence_url stored as parquet as
select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from publication p from result r
left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host
from publication p from result r
join publication_licenses as license on license.id = p.id join result_licenses as license on license.id = r.id
WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on p.id= tmp.id; on r.id= tmp.id;
-- EOSC-TR1.1-02M:
-- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might be only useful when applied
-- to openly available publications.
--create table indi_pub_has_cc_licence_tr stored as parquet as
--select distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_tr
--from publication p
--left outer join (select p.id, license.type as lic from publication p
--join publication_licenses as license on license.id = p.id
--where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
--on p.id= tmp.id
-- #EOSC-F2-01M_cc Rich metadata for scholarly publications
-- ## Indicator: has_cc_license. Creative Commons licensing has become a
-- de facto standard in scholarly communication and is promoted by many initiatives
-- like Plan S. This indicator might be only useful when applied
-- to openly available publications.
-- Same indicator as EOSC-TR1.1-02M (Najko's instructions)
-- create table indi_pub_has_cc_licence_f stored as parquet as
-- select
-- distinct p.id, case when lic='' or lic is null then 0 else 1 end as has_cc_license_f
-- from publication p
-- left outer join (selectp.id,license.type as lic from publication p
-- join publication_licenses as license on license.id = p.id
-- where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
-- on p.id= tmp.id
create table indi_pub_has_abstract stored as parquet as create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication; from publication;
create table indi_result_with_orcid stored as parquet as create table indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp left outer join (select id, 1 as has_orcid from result_orcid) tmp
on r.id= tmp.id on r.id= tmp.id;
create table indi_funded_result_with_fundref stored as parquet as
---- Sprint 3 ----
create table indi_funded_result_with_fundref stored as parquet as
select distinct r.id, coalesce(fundref, 0) as fundref select distinct r.id, coalesce(fundref, 0) as fundref
from project_results r from project_results r
left outer join (select distinct id, 1 as fundref from project_results left outer join (select distinct id, 1 as fundref from project_results
where provenance='Harvested') tmp where provenance='Harvested') tmp
on r.id= tmp.id on r.id= tmp.id;
create table indi_result_org_country_collab stored as parquet as create table indi_result_org_country_collab stored as parquet as
with tmp as with tmp as
(select o.id as id, o.country , ro.id as result,r.type from organization o (select o.id as id, o.country , ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id where o.country <> 'UNKNOWN') join result r on r.id=ro.id where o.country <> 'UNKNOWN')
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country where o1.id<>o2.id and o1.country<>o2.country
group by o1.id, o1.type,o2.country group by o1.id, o1.type,o2.country;
create table indi_result_org_collab stored as parquet as create table indi_result_org_collab stored as parquet as
with tmp as with tmp as
(select o.id, ro.id as result,r.type from organization o (select o.id, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id) join result r on r.id=ro.id)
@ -297,66 +99,82 @@ select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaboratio
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id where o1.id<>o2.id
group by o1.id, o2.id, o1.type group by o1.id, o2.id, o1.type;
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id join organization o on o.id=op.id
join project p on p.id=op.project join project p on p.id=op.project
where country <> 'UNKNOWN') where country <> 'UNKNOWN')
select f1.funder, f1.country, f2.country, count(distinct f1.project) as collaborations select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1 from tmp as f1
join tmp as f2 on f1.project=f2.project join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country where f1.country<>f2.country
group by f1.funder, f2.country, f1.country group by f1.funder, f2.country, f1.country;
create table indi_result_country_collab stored as parquet as
with tmp as
(select country, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id)
select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country
group by o1.country, o2.country, o1.type;
---- Sprint 4 ----
create table indi_pub_diamond stored as parquet as create table indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as in_diamond_journal from publication_datasources pd select pd.id, 1 as in_diamond_journal from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
on pd.id=tmp.id on pd.id=tmp.id;
create table indi_pub_hybrid stored as parquet as create table indi_pub_hybrid stored as parquet as
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as is_hybrid from publication_datasources pd select pd.id, 1 as is_hybrid from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
on pd.id=tmp.id on pd.id=tmp.id;
create table indi_is_gold_oa stored as parquet as create table indi_is_gold_oa stored as parquet as
(select distinct pd.id, coalesce(gold_oa, 0) as gold_oa (select distinct pd.id, coalesce(gold_oa, 0) as gold_oa
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
select pd.id, 1 as gold_oa from publication_datasources pd select pd.id, 1 as gold_oa from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps on (ps.issn_print=d.issn_printed or ps.issn_online=d.issn_online)
where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp where ps.journal_is_in_doaj is true or ps.journal_is_oa is true) tmp
on pd.id=tmp.id) on pd.id=tmp.id);
create table indi_pub_in_transformative stored as parquet as create table indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from publication pd from publication pd
left outer join ( left outer join (
select pd.id, 1 as is_transformative from publication_datasources pd select pd.id, 1 as is_transformative from publication_datasources pd
join datasource d on d.id=pd.datasource join datasource d on d.id=pd.datasource
join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
and ps.is_transformative_journal=true) tmp and ps.is_transformative_journal=true) tmp
on pd.id=tmp.id on pd.id=tmp.id;
create table indi_pub_closed_other_open stored as parquet as create table indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join left outer join
(select ri.id, 1 as pub_closed_other_open from result_instance ri (select ri.id, 1 as pub_closed_other_open from result_instance ri
join publication p on p.id=ri.id join publication p on p.id=ri.id
join datasource d on ri.hostedby=d.id join datasource d on ri.hostedby=d.id
where d.type like '%Journal%' and ri.accessright='Closed Access' and where d.type like '%Journal%' and ri.accessright='Closed Access' and
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id on tmp.id=ri.id;
---- Sprint 5 ----
create table indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from result_instance group by id;

View File

@ -49,8 +49,10 @@ compute stats TARGET.result_greenoa;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_languages; compute stats TARGET.result_languages;
create table TARGET.result_licences stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_licences; compute stats TARGET.result_licenses;
create table TARGET.licenses_normalized as select * from SOURCE.licenses_normalized;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_oids; compute stats TARGET.result_oids;
@ -79,6 +81,9 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics; compute stats TARGET.result_topics;
create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
compute stats TARGET.result_result;
-- datasources -- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource; create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
@ -100,6 +105,7 @@ create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
compute stats TARGET.project_results; compute stats TARGET.project_results;
@ -115,10 +121,10 @@ create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.i
compute stats TARGET.indi_pub_gold_oa; compute stats TARGET.indi_pub_gold_oa;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_has_abstract; compute stats TARGET.indi_pub_has_abstract;
create table TARGET.indi_pub_has_cc_licence stored as parquet as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_has_cc_licence; compute stats TARGET.indi_result_has_cc_licence;
create table TARGET.indi_pub_has_cc_licence_url stored as parquet as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_has_cc_licence_url; compute stats TARGET.indi_result_has_cc_licence_url;
create view TARGET.indi_funder_country_collab stored as select * from SOURCE.indi_funder_country_collab; create view TARGET.indi_funder_country_collab stored as select * from SOURCE.indi_funder_country_collab;
@ -135,6 +141,12 @@ compute stats TARGET.indi_pub_in_transformative;
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_pub_closed_other_open; compute stats TARGET.indi_pub_closed_other_open;
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_no_of_copies;
--- Usage statistics
create table TARGET.usage_stats stored as parquet as select * from SOURCE.usage_stats orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--denorm --denorm
alter table TARGET.result rename to TARGET.res_tmp; alter table TARGET.result rename to TARGET.res_tmp;