Changes to execute the stats wf only in hive

This commit is contained in:
dimitrispie 2023-01-04 11:39:01 +02:00
parent 2a4bf32d4c
commit dcb958e146
11 changed files with 472 additions and 554 deletions

View File

@ -31,8 +31,8 @@ hdfs dfs -copyFromLocal categories.csv ${TMP}
hdfs dfs -copyFromLocal concepts.csv ${TMP} hdfs dfs -copyFromLocal concepts.csv ${TMP}
hdfs dfs -chmod -R 777 ${TMP} hdfs dfs -chmod -R 777 ${TMP}
export HADOOP_USER="antonis.lempesis" export HADOOP_USER="dimitris.pierrakos"
export HADOOP_USER_NAME="antonis.lempesis" export HADOOP_USER_NAME="dimitris.pierrakos"
echo "Creating and populating impala tables" echo "Creating and populating impala tables"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"

View File

@ -8,7 +8,9 @@ fi
export SOURCE=$1 export SOURCE=$1
export SHADOW=$2 export SHADOW=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Updating shadow database" echo "Updating shadow database"
hive --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/^\(.*\)/analyze table ${SOURCE}.\1 compute statistics;/" > foo
hive -f foo hive $HIVE_OPTS -f foo

View File

@ -8,8 +8,8 @@ fi
export TARGET=$1 export TARGET=$1
export SCRIPT_PATH=$2 export SCRIPT_PATH=$2
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER="antonis.lempesis" export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH

View File

@ -11,10 +11,15 @@ export TARGET=$2
export SHADOW=$3 export SHADOW=$3
export SCRIPT_PATH=$4 export SCRIPT_PATH=$4
echo "Getting file from " $4 export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
hdfs dfs -copyToLocal $4 export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating monitor database" echo "Creating monitor database"
cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo #cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
hive -f foo cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g" > foo
echo "Impala shell finished" hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -12,4 +12,4 @@ export SHADOW=$3
hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo hive --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo
hive -f foo hive -f foo
echo "Impala shell finished" echo "Hive shell finished"

View File

@ -29,6 +29,13 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els
from rcount from rcount
group by rcount.pid; group by rcount.pid;
create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
create view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
create view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
create view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
create view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
create view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
create table ${stats_db_name}.result_instance stored as parquet as create table ${stats_db_name}.result_instance stored as parquet as
select distinct r.* select distinct r.*
from ( from (
@ -44,3 +51,5 @@ from (
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null; where r.amount is not null;
create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset;

View File

@ -1,5 +1,5 @@
-- Sprint 1 ---- -- Sprint 1 ----
create table indi_pub_green_oa stored as parquet as create table if not exists indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select distinct p.id, coalesce(green_oa, 0) as green_oa
from publication p from publication p
left outer join ( left outer join (
@ -12,9 +12,9 @@ from publication p
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
on p.id= tmp.id; on p.id= tmp.id;
compute stats indi_pub_green_oa; ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS;
create table indi_pub_grey_lit stored as parquet as create table if not exists indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit select distinct p.id, coalesce(grey_lit, 0) as grey_lit
from publication p from publication p
left outer join ( left outer join (
@ -25,9 +25,9 @@ from publication p
not exists (select 1 from result_classifications rc where type ='Other literature type' not exists (select 1 from result_classifications rc where type ='Other literature type'
and rc.id=p.id)) tmp on p.id=tmp.id; and rc.id=p.id)) tmp on p.id=tmp.id;
compute stats indi_pub_grey_lit; ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS;
create table indi_pub_doi_from_crossref stored as parquet as create table if not exists indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from publication p from publication p
left outer join left outer join
@ -36,10 +36,10 @@ from publication p
where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp
on tmp.id=p.id; on tmp.id=p.id;
compute stats indi_pub_doi_from_crossref; ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS;
-- Sprint 2 ---- -- Sprint 2 ----
create table indi_result_has_cc_licence stored as parquet as create table if not exists indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from result r from result r
left outer join (select r.id, license.type as lic from result r left outer join (select r.id, license.type as lic from result r
@ -47,9 +47,9 @@ from result r
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on r.id= tmp.id; on r.id= tmp.id;
compute stats indi_result_has_cc_licence; ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS;
create table indi_result_has_cc_licence_url stored as parquet as create table if not exists indi_result_has_cc_licence_url stored as parquet as
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from result r from result r
left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host
@ -58,31 +58,31 @@ from result r
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on r.id= tmp.id; on r.id= tmp.id;
compute stats indi_result_has_cc_licence_url; ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS;
create table indi_pub_has_abstract stored as parquet as create table if not exists indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
from publication; from publication;
compute stats indi_pub_has_abstract; ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS;
create table indi_result_with_orcid stored as parquet as create table if not exists indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp left outer join (select id, 1 as has_orcid from result_orcid) tmp
on r.id= tmp.id; on r.id= tmp.id;
compute stats indi_result_with_orcid; ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS;
---- Sprint 3 ---- ---- Sprint 3 ----
create table indi_funded_result_with_fundref stored as parquet as create table if not exists indi_funded_result_with_fundref stored as parquet as
select distinct r.result as id, coalesce(fundref, 0) as fundref select distinct r.result as id, coalesce(fundref, 0) as fundref
from project_results r from project_results r
left outer join (select distinct result, 1 as fundref from project_results left outer join (select distinct result, 1 as fundref from project_results
where provenance='Harvested') tmp where provenance='Harvested') tmp
on r.result= tmp.result; on r.result= tmp.result;
compute stats indi_funded_result_with_fundref; ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS;
-- create table indi_result_org_collab stored as parquet as -- create table indi_result_org_collab stored as parquet as
-- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations -- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations
@ -92,77 +92,59 @@ compute stats indi_funded_result_with_fundref;
-- --
-- compute stats indi_result_org_collab; -- compute stats indi_result_org_collab;
-- --
create table indi_result_org_collab stored as parquet as create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id from result_organization ro
with tmp as ( join organization o on o.id=ro.organization where o.name is not null;
select distinct ro.organization organization, ro.id from result_organization ro
join organization o on o.id=ro.organization where o.name is not null) create table if not exists indi_result_org_collab stored as parquet as
select o1.organization org1, o2.organization org2, count(o1.id) as collaborations select o1.organization org1, o2.organization org2, count(o1.id) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.id=o2.id and o1.organization!=o2.organization join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization
group by org1, org2; group by o1.organization, o2.organization;
compute stats indi_result_org_collab; drop table tmp purge;
-- create table indi_result_org_country_collab stored as parquet as ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS;
-- with tmp as
-- (select o.id as id, o.country , ro.id as result,r.type from organization o create TEMPORARY TABLE tmp AS
-- join result_organization ro on o.id=ro.organization select distinct ro.organization organization, ro.id, o.country from result_organization ro
-- join result r on r.id=ro.id where o.country <> 'UNKNOWN') join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null;
-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1 create table if not exists indi_result_org_country_collab stored as parquet as
-- join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id and o1.country<>o2.country
-- group by o1.id, o1.type,o2.country;
--
-- compute stats indi_result_org_country_collab;
--
create table indi_result_org_country_collab stored as parquet as
with tmp as
(select distinct ro.organization organization, ro.id, o.country from result_organization ro
join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null)
select o1.organization org1,o2.country country2, count(o1.id) as collaborations select o1.organization org1,o2.country country2, count(o1.id) as collaborations
from tmp as o1 join tmp as o2 on o1.id=o2.id from tmp as o1 join tmp as o2 on o1.id=o2.id
where o1.id=o2.id and o1.country!=o2.country where o1.id=o2.id and o1.country!=o2.country
group by o1.organization, o1.id, o2.country; group by o1.organization, o1.id, o2.country;
compute stats indi_result_org_country_collab; drop table tmp purge;
-- create table indi_result_org_collab stored as parquet as ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS;
-- with tmp as
-- (select o.id, ro.id as result,r.type from organization o create table if not exists indi_project_collab_org stored as parquet as
-- join result_organization ro on o.id=ro.organization
-- join result r on r.id=ro.id)
-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id
-- group by o1.id, o2.id, o1.type;
--
-- compute stats indi_result_org_collab;
--
create table indi_project_collab_org stored as parquet as
select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations
from organization_projects as o1 from organization_projects as o1
join organization_projects as o2 on o1.project=o2.project join organization_projects as o2 on o1.project=o2.project
where o1.id!=o2.id where o1.id!=o2.id
group by o1.id, o2.id; group by o1.id, o2.id;
compute stats indi_project_collab_org; ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS;
create table indi_project_collab_org_country stored as parquet as create TEMPORARY TABLE tmp AS
with tmp as select o.id organization, o.country , ro.project as project from organization o
(select o.id organization, o.country , ro.project as project from organization o
join organization_projects ro on o.id=ro.id join organization_projects ro on o.id=ro.id
and o.country <> 'UNKNOWN') and o.country <> 'UNKNOWN';
create table if not exists indi_project_collab_org_country stored as parquet as
select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country where o1.organization<>o2.organization and o1.country<>o2.country
group by o1.organization, o2.country; group by o1.organization, o2.country;
compute stats indi_project_collab_org_country; drop table tmp purge;
create table indi_funder_country_collab stored as parquet as ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS;
create table if not exists indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id join organization o on o.id=op.id
join project p on p.id=op.project join project p on p.id=op.project
@ -173,36 +155,26 @@ from tmp as f1
where f1.country<>f2.country where f1.country<>f2.country
group by f1.funder, f2.country, f1.country; group by f1.funder, f2.country, f1.country;
compute stats indi_funder_country_collab; ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS;
--
-- create table indi_result_country_collab stored as parquet as
-- with tmp as
-- (select country, ro.id as result,r.type from organization o
-- join result_organization ro on o.id=ro.organization
-- join result r on r.id=ro.id where country <> 'UNKNOWN')
-- select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.country<>o2.country
-- group by o1.country, o2.country, o1.type;
--
-- compute stats indi_result_country_collab;
create table indi_result_country_collab stored as parquet as create TEMPORARY TABLE tmp AS
with tmp as select distinct country, ro.id as result from organization o
(select distinct country, ro.id as result from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null) where country <> 'UNKNOWN' and o.name is not null;
create table if not exists indi_result_country_collab stored as parquet as
select o1.country country1, o2.country country2, count(o1.result) as collaborations select o1.country country1, o2.country country2, count(o1.result) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country where o1.country<>o2.country
group by o1.country, o2.country; group by o1.country, o2.country;
compute stats indi_result_country_collab; drop table tmp purge;
ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS;
---- Sprint 4 ---- ---- Sprint 4 ----
create table indi_pub_diamond stored as parquet as create table if not exists indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from publication_datasources pd from publication_datasources pd
left outer join ( left outer join (
@ -212,21 +184,9 @@ from publication_datasources pd
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
on pd.id=tmp.id; on pd.id=tmp.id;
compute stats indi_pub_diamond; ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS;
--create table indi_pub_hybrid stored as parquet as create table if not exists indi_pub_in_transformative stored as parquet as
--select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
--from publication_datasources pd
-- left outer join (
-- select pd.id, 1 as is_hybrid from publication_datasources pd
-- join datasource d on d.id=pd.datasource
-- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online)
-- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
-- on pd.id=tmp.id;
--
--compute stats indi_pub_hybrid;
create table indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from publication pd from publication pd
left outer join ( left outer join (
@ -236,9 +196,9 @@ from publication pd
and ps.is_transformative_journal=true) tmp and ps.is_transformative_journal=true) tmp
on pd.id=tmp.id; on pd.id=tmp.id;
compute stats indi_pub_in_transformative; ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS;
create table indi_pub_closed_other_open stored as parquet as create table if not exists indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join left outer join
(select ri.id, 1 as pub_closed_other_open from result_instance ri (select ri.id, 1 as pub_closed_other_open from result_instance ri
@ -248,180 +208,16 @@ select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_op
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id; on tmp.id=ri.id;
compute stats indi_pub_closed_other_open; ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS;
---- Sprint 5 ---- ---- Sprint 5 ----
create table indi_result_no_of_copies stored as parquet as create table if not exists indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from result_instance group by id; select id, count(id) as number_of_copies from result_instance group by id;
compute stats indi_result_no_of_copies; ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ---- ---- Sprint 6 ----
--create table indi_pub_gold_oa stored as parquet as create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION ALL
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT * FROM
-- (SELECT id, issn_printed as issn
-- FROM datasource WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource WHERE issn_online IS NOT NULL) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM publication_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM publication_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--compute stats indi_pub_gold_oa;
--
--create table indi_datasets_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION
-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT *
-- FROM (
-- SELECT id,issn_printed as issn
-- FROM datasource
-- WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource
-- WHERE issn_online IS NOT NULL ) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM dataset_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
--compute stats indi_datasets_gold_oa;
--create table indi_software_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION
-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT *
-- FROM (
-- SELECT id,issn_printed as issn
-- FROM datasource
-- WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource
-- WHERE issn_online IS NOT NULL ) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM software_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM software_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
--compute stats indi_software_gold_oa;
--create table indi_org_findable stored as parquet as
--with result_with_pid as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
-- join result_pids rp on rp.id=ro.id
-- group by ro.organization),
--result_has_abstract as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro
-- join result rp on rp.id=ro.id where rp.abstract=true
-- group by ro.organization),
--allresults as (
-- select organization, count(distinct id) no_allresults from result_organization
-- group by organization),
--result_with_pid_share as (
-- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share
-- from allresults
-- join result_with_pid on result_with_pid.organization=allresults.organization),
--result_with_abstract_share as (
-- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share
-- from allresults
-- join result_has_abstract on result_has_abstract.organization=allresults.organization)
--select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable
--from allresults
--join result_with_pid_share on result_with_pid_share.organization=allresults.organization
--left outer join (
-- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
--
--compute stats indi_org_findable;
--
--create table indi_org_openess stored as parquet as
--WITH datasets_oa as (
-- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join dataset ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--software_oa as (
-- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join software ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--pubs_oa as (
-- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join publication ds on dg.id=ds.id
-- where dg.is_gold=1
-- group by ro.organization),
--allpubs as (
-- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
-- join publication ps on ps.id=ro.id
-- group by ro.organization),
--alldatasets as (
-- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
-- join dataset ps on ps.id=ro.id
-- group by ro.organization),
--allsoftware as (
-- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
-- join software ps on ps.id=ro.id
-- group by ro.organization),
--allpubsshare as (
-- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
-- join pubs_oa on allpubs.organization=pubs_oa.organization),
--alldatasetssshare as (
-- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c
-- from alldatasets
-- join datasets_oa on alldatasets.organization=datasets_oa.organization),
--allsoftwaresshare as (
-- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
-- from allsoftware
-- join software_oa on allsoftware.organization=software_oa.organization)
--select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess
--FROM allpubsshare
--left outer join (
-- select organization,c from
-- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization
--left outer join (
-- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
--
--compute stats indi_org_openess;
--
create table indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS ( WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
FROM stats_ext.plan_s_jn FROM stats_ext.plan_s_jn
@ -436,7 +232,7 @@ create table indi_pub_hybrid_oa_with_cc stored as parquet as
SELECT id, issn_printed as issn SELECT id, issn_printed as issn
FROM datasource FROM datasource
WHERE issn_printed IS NOT NULL WHERE issn_printed IS NOT NULL
UNION UNION ALL
SELECT id,issn_online as issn SELECT id,issn_online as issn
FROM datasource FROM datasource
WHERE issn_online IS NOT NULL ) as issn WHERE issn_online IS NOT NULL ) as issn
@ -451,45 +247,44 @@ FROM publication_datasources pd
JOIN indi_result_has_cc_licence cc on pd.id=cc.id JOIN indi_result_has_cc_licence cc on pd.id=cc.id
where cc.has_cc_license=1) tmp on pd.id=tmp.id; where cc.has_cc_license=1) tmp on pd.id=tmp.id;
compute stats indi_pub_hybrid_oa_with_cc; ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
create table indi_pub_downloads stored as parquet as create table if not exists indi_pub_downloads stored as parquet as
SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats
join publication on result_id=id join publication on result_id=id
where downloads>0 where downloads>0
GROUP BY result_id GROUP BY result_id
order by no_downloads desc; order by no_downloads desc;
compute stats indi_pub_downloads; ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS;
create table indi_pub_downloads_datasource stored as parquet as create table if not exists indi_pub_downloads_datasource stored as parquet as
SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats
join publication on result_id=id join publication on result_id=id
where downloads>0 where downloads>0
GROUP BY result_id, repository_id GROUP BY result_id, repository_id
order by result_id; order by result_id;
compute stats indi_pub_downloads_datasource; ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS;
create table indi_pub_downloads_year stored as parquet as create table if not exists indi_pub_downloads_year stored as parquet as
SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join publication on result_id=id where downloads>0 join publication on result_id=id where downloads>0
GROUP BY result_id, `year` GROUP BY result_id, substring(us.`date`, 1,4);
order by `year` asc;
compute stats indi_pub_downloads_year; ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS;
create table indi_pub_downloads_datasource_year stored as parquet as create table if not exists indi_pub_downloads_datasource_year stored as parquet as
SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us
join publication on result_id=id join publication on result_id=id
where downloads>0 where downloads>0
GROUP BY result_id, repository_id, `year` GROUP BY result_id, repository_id, substring(us.`date`, 1,4);
order by `year` asc, result_id;
compute stats indi_pub_downloads_datasource_year; ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS;
---- Sprint 7 ---- ---- Sprint 7 ----
create table indi_pub_gold_oa stored as parquet as create table if not exists indi_pub_gold_oa stored as parquet as
WITH gold_oa AS ( SELECT WITH gold_oa AS ( SELECT
issn_l, issn_l,
journal_is_in_doaj, journal_is_in_doaj,
@ -518,7 +313,7 @@ create table indi_pub_gold_oa stored as parquet as
datasource datasource
WHERE WHERE
issn_printed IS NOT NULL issn_printed IS NOT NULL
UNION UNION ALL
SELECT SELECT
id, id,
issn_online as issn issn_online as issn
@ -538,9 +333,9 @@ FROM
JOIN gold_oa on issn.issn = gold_oa.issn) tmp JOIN gold_oa on issn.issn = gold_oa.issn) tmp
on pd.id=tmp.id; on pd.id=tmp.id;
compute stats indi_pub_gold_oa; ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS;
create table indi_pub_hybrid stored as parquet as create table if not exists indi_pub_hybrid stored as parquet as
WITH gold_oa AS ( SELECT WITH gold_oa AS ( SELECT
issn_l, issn_l,
journal_is_in_doaj, journal_is_in_doaj,
@ -571,7 +366,7 @@ create table indi_pub_hybrid stored as parquet as
datasource datasource
WHERE WHERE
issn_printed IS NOT NULL issn_printed IS NOT NULL
UNION UNION ALL
SELECT SELECT
id, id,
issn_online as issn issn_online as issn
@ -591,15 +386,15 @@ from publication_datasources pd
where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp
on pd.id=tmp.id; on pd.id=tmp.id;
compute stats indi_pub_hybrid; ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS;
create table indi_org_fairness stored as parquet as create table if not exists indi_org_fairness stored as parquet as
--return results with PIDs, and rich metadata group by organization --return results with PIDs, and rich metadata group by organization
with result_fair as with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join result r on r.id=ro.id join result r on r.id=ro.id
--join result_pids rp on r.id=rp.id --join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization), group by ro.organization),
--return all results group by organization --return all results group by organization
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
@ -611,16 +406,16 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization; join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness; ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS;
create table indi_org_fairness_pub_pr stored as parquet as create table if not exists indi_org_fairness_pub_pr stored as parquet as
with result_fair as with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair (select ro.organization organization, count(distinct ro.id) no_result_fair
from result_organization ro from result_organization ro
join publication p on p.id=ro.id join publication p on p.id=ro.id
join indi_pub_doi_from_crossref dc on dc.id=p.id join indi_pub_doi_from_crossref dc on dc.id=p.id
join indi_pub_grey_lit gl on gl.id=p.id join indi_pub_grey_lit gl on gl.id=p.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null)
and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0
group by ro.organization), group by ro.organization),
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
@ -632,150 +427,180 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization; join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness_pub_pr; ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS;
create table indi_org_fairness_pub_year stored as parquet as CREATE TEMPORARY table result_fair as
with result_fair as select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
(select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result p on p.id=ro.id
join publication p on p.id=ro.id where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization, year;
group by ro.organization, year),
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro CREATE TEMPORARY TABLE allresults as select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join publication p on p.id=ro.id join result p on p.id=ro.id
where cast(year as int)>2003 where cast(year as int)>2003
group by organization, year) group by organization, year;
create table if not exists indi_org_fairness_pub_year stored as parquet as
select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year;
compute stats indi_org_fairness_pub_year; DROP table result_fair purge;
DROP table allresults purge;
create table indi_org_fairness_pub as ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS;
with result_fair as
(select ro.organization organization, count(distinct ro.id) no_result_fair CREATE TEMPORARY TABLE result_fair as
select ro.organization organization, count(distinct ro.id) no_result_fair
from result_organization ro from result_organization ro
join publication p on p.id=ro.id join result p on p.id=ro.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null)
and (authors>0) and cast(year as int)>2003 and (authors>0) and cast(year as int)>2003
group by ro.organization), group by ro.organization;
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro
join publication p on p.id=ro.id CREATE TEMPORARY TABLE allresults as
select organization, count(distinct ro.id) no_allresults from result_organization ro
join result p on p.id=ro.id
where cast(year as int)>2003 where cast(year as int)>2003
group by organization) group by organization;
create table if not exists indi_org_fairness_pub as
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults join result_fair on result_fair.organization=allresults.organization;
join result_fair on result_fair.organization=allresults.organization;
compute stats indi_org_fairness_pub; DROP table result_fair purge;
DROP table allresults purge;
create table indi_org_fairness_year stored as parquet as ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS;
with result_fair as
(select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro CREATE TEMPORARY TABLE result_fair as
select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro
join result r on r.id=ro.id join result r on r.id=ro.id
join result_pids rp on r.id=rp.id join result_pids rp on r.id=rp.id
where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003
group by ro.organization, year), group by ro.organization, year;
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro
CREATE TEMPORARY TABLE allresults as
select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id join result r on r.id=ro.id
where cast(year as int)>2003 where cast(year as int)>2003
group by organization, year) group by organization, year;
--return results_fair/all_results
create table if not exists indi_org_fairness_year stored as parquet as
select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year;
compute stats indi_org_fairness_year; DROP table result_fair purge;
DROP table allresults purge;
create table indi_org_findable_year stored as parquet as ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS;
--return results with PIDs group by organization,year
with result_with_pid as CREATE TEMPORARY TABLE result_with_pid as
(select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id join result_pids rp on rp.id=ro.id
join result r on r.id=rp.id join result r on r.id=rp.id
where cast(year as int) >2003 where cast(year as int) >2003
group by ro.organization, year), group by ro.organization, year;
--return all results group by organization,year
allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro CREATE TEMPORARY TABLE allresults as
select year, organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id join result r on r.id=ro.id
where cast(year as int) >2003 where cast(year as int) >2003
group by organization, year) group by organization, year;
--return results_with_pid/all_results
create table if not exists indi_org_findable_year stored as parquet as
select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults from allresults
join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year;
compute stats indi_org_findable_year; DROP table result_with_pid purge;
DROP table allresults purge;
create table indi_org_findable stored as parquet as ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS;
--return results with PIDs group by organization
with result_with_pid as CREATE TEMPORARY TABLE result_with_pid as
(select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id join result_pids rp on rp.id=ro.id
join result r on r.id=rp.id join result r on r.id=rp.id
where cast(year as int) >2003 where cast(year as int) >2003
group by ro.organization), group by ro.organization;
--return all results group by organization
allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro CREATE TEMPORARY TABLE allresults as
select organization, count(distinct ro.id) no_allresults from result_organization ro
join result r on r.id=ro.id join result r on r.id=ro.id
where cast(year as int) >2003 where cast(year as int) >2003
group by organization) group by organization;
--return results_with_pid/all_results
create table if not exists indi_org_findable stored as parquet as
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults from allresults
join result_with_pid on result_with_pid.organization=allresults.organization; join result_with_pid on result_with_pid.organization=allresults.organization;
compute stats indi_org_findable; DROP table result_with_pid purge;
DROP table allresults purge;
create table indi_org_openess stored as parquet as ANALYZE TABLE indi_org_findable COMPUTE STATISTICS;
WITH pubs_oa as (
CREATE TEMPORARY TABLE pubs_oa as
SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r
join result_organization ro on ro.id=r.id join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003 and cast(r.year as int)>2003
group by ro.organization), group by ro.organization;
datasets_oa as (
CREATE TEMPORARY TABLE datasets_oa as
SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r
join result_organization ro on ro.id=r.id join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003 and cast(r.year as int)>2003
group by ro.organization), group by ro.organization;
software_oa as (
CREATE TEMPORARY TABLE software_oa as
SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r
join result_organization ro on ro.id=r.id join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003 and cast(r.year as int)>2003
group by ro.organization), group by ro.organization;
allpubs as (
CREATE TEMPORARY TABLE allpubs as
SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join publication ps on ps.id=ro.id join publication ps on ps.id=ro.id
where cast(ps.year as int)>2003 where cast(ps.year as int)>2003
group by ro.organization), group by ro.organization;
alldatasets as (
CREATE TEMPORARY TABLE alldatasets as
SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join dataset ps on ps.id=ro.id join dataset ps on ps.id=ro.id
where cast(ps.year as int)>2003 where cast(ps.year as int)>2003
group by ro.organization), group by ro.organization;
allsoftware as (
CREATE TEMPORARY TABLE allsoftware as
SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join software ps on ps.id=ro.id join software ps on ps.id=ro.id
where cast(ps.year as int)>2003 where cast(ps.year as int)>2003
group by ro.organization), group by ro.organization;
allpubsshare as (
CREATE TEMPORARY TABLE allpubsshare as
select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization), join pubs_oa on allpubs.organization=pubs_oa.organization;
alldatasetssshare as (
CREATE TEMPORARY TABLE alldatasetssshare as
select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d
from alldatasets from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization), join datasets_oa on alldatasets.organization=datasets_oa.organization;
allsoftwaresshare as (
CREATE TEMPORARY TABLE allsoftwaresshare as
select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware from allsoftware
join software_oa on allsoftware.organization=software_oa.organization) join software_oa on allsoftware.organization=software_oa.organization;
create table if not exists indi_org_openess stored as parquet as
select allpubsshare.organization, select allpubsshare.organization,
(p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare org_openess FROM allpubsshare
left outer join (select organization,d from left outer join (select organization,d from
@ -785,55 +610,75 @@ select allpubsshare.organization,
allsoftwaresshare) tmp2 allsoftwaresshare) tmp2
on tmp2.organization=allpubsshare.organization; on tmp2.organization=allpubsshare.organization;
compute stats indi_org_openess; DROP TABLE pubs_oa purge;
DROP TABLE datasets_oa purge;
DROP TABLE software_oa purge;
DROP TABLE allpubs purge;
DROP TABLE alldatasets purge;
DROP TABLE allsoftware purge;
DROP TABLE allpubsshare purge;
DROP TABLE alldatasetssshare purge;
DROP TABLE allsoftwaresshare purge;
create table indi_org_openess_year stored as parquet as ANALYZE TABLE indi_org_openess COMPUTE STATISTICS;
WITH pubs_oa as (
CREATE TEMPORARY TABLE pubs_oa AS
SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r
join result_organization ro on ro.id=r.id join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003 and cast(r.year as int)>2003
group by ro.organization,r.year), group by ro.organization,r.year;
datasets_oa as (
CREATE TEMPORARY TABLE datasets_oa AS
SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r
join result_organization ro on ro.id=r.id join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003 and cast(r.year as int)>2003
group by ro.organization, r.year), group by ro.organization, r.year;
software_oa as (
CREATE TEMPORARY TABLE software_oa AS
SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r
join result_organization ro on ro.id=r.id join result_organization ro on ro.id=r.id
join result_instance ri on ri.id=r.id join result_instance ri on ri.id=r.id
where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')
and cast(r.year as int)>2003 and cast(r.year as int)>2003
group by ro.organization, r.year), group by ro.organization, r.year;
allpubs as (
CREATE TEMPORARY TABLE allpubs as
SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join publication p on p.id=ro.id where cast(p.year as int)>2003 join publication p on p.id=ro.id where cast(p.year as int)>2003
group by ro.organization, p.year), group by ro.organization, p.year;
alldatasets as (
CREATE TEMPORARY TABLE alldatasets as
SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join dataset d on d.id=ro.id where cast(d.year as int)>2003 join dataset d on d.id=ro.id where cast(d.year as int)>2003
group by ro.organization, d.year), group by ro.organization, d.year;
allsoftware as (
CREATE TEMPORARY TABLE allsoftware as
SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join software s on s.id=ro.id where cast(s.year as int)>2003 join software s on s.id=ro.id where cast(s.year as int)>2003
group by ro.organization, s.year), group by ro.organization, s.year;
allpubsshare as (
CREATE TEMPORARY TABLE allpubsshare as
select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)), join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int);
alldatasetssshare as (
CREATE TEMPORARY TABLE alldatasetssshare as
select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d
from alldatasets from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)), join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int);
allsoftwaresshare as (
CREATE TEMPORARY TABLE allsoftwaresshare as
select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware from allsoftware
join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int)) join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int);
create table if not exists indi_org_openess_year stored as parquet as
select allpubsshare.year, allpubsshare.organization, select allpubsshare.year, allpubsshare.organization,
(p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end)) +(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare org_openess FROM allpubsshare
left outer join (select year, organization,d from left outer join (select year, organization,d from
@ -843,9 +688,19 @@ select allpubsshare.year, allpubsshare.organization,
allsoftwaresshare) tmp2 allsoftwaresshare) tmp2
on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year; on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year;
compute stats indi_org_openess_year; DROP TABLE pubs_oa purge;
DROP TABLE datasets_oa purge;
DROP TABLE software_oa purge;
DROP TABLE allpubs purge;
DROP TABLE alldatasets purge;
DROP TABLE allsoftware purge;
DROP TABLE allpubsshare purge;
DROP TABLE alldatasetssshare purge;
DROP TABLE allsoftwaresshare purge;
create table indi_pub_has_preprint stored as parquet as ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS;
create table if not exists indi_pub_has_preprint stored as parquet as
select distinct p.id, coalesce(has_preprint, 0) as has_preprint select distinct p.id, coalesce(has_preprint, 0) as has_preprint
from publication_classifications p from publication_classifications p
left outer join ( left outer join (
@ -854,9 +709,9 @@ from publication_classifications p
where p.type='Preprint') tmp where p.type='Preprint') tmp
on p.id= tmp.id; on p.id= tmp.id;
compute stats indi_pub_has_preprint; ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS;
create table indi_pub_in_subscribed stored as parquet as create table if not exists indi_pub_in_subscribed stored as parquet as
select distinct p.id, coalesce(is_subscription, 0) as is_subscription select distinct p.id, coalesce(is_subscription, 0) as is_subscription
from publication p from publication p
left outer join( left outer join(
@ -867,9 +722,9 @@ from publication p
where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp
on p.id=tmp.id; on p.id=tmp.id;
compute stats indi_pub_in_subscribed; ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS;
create table indi_result_with_pid as create table if not exists indi_result_with_pid as
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from result p from result p
left outer join ( left outer join (
@ -877,4 +732,4 @@ from result p
from result_pids p) tmp from result_pids p) tmp
on p.id= tmp.id; on p.id= tmp.id;
compute stats indi_result_with_pid; ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;

View File

@ -10,6 +10,11 @@ create view if not exists TARGET.creation_date as select * from SOURCE.creation_
create view if not exists TARGET.funder as select * from SOURCE.funder; create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref; create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
create table TARGET.result stored as parquet as create table TARGET.result stored as parquet as
select distinct * from ( select distinct * from (
@ -54,84 +59,87 @@ create table TARGET.result stored as parquet as
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483' -- University College Cork
) )) foo; ) )) foo;
-- Statistics for TARGET.result, gathered with Hive syntax (Impala's "compute stats" is not valid in Hive).
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

-- Result-related tables: each one copies the SOURCE table of the same name, keeping only the rows
-- whose id exists in TARGET.result, and then gathers statistics on the new table.
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;

create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;

create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;

create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;

create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;

create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;

create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;

create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;

create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;

create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;

create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;

create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;

-- licenses_normalized is copied in full (no filter on TARGET.result).
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;

create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;

create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;

create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;

create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;

create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;

create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;

create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;

create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;

create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;

create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;

-- result_result keeps every SOURCE.result_result row whose source OR target is in TARGET.result.
-- foo1/foo2 are scratch views holding each half; "select distinct" over their union removes the
-- rows matched by both halves. The scratch views are dropped once the table is built.
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1;
drop view TARGET.foo2;
ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
-- datasources
-- Pass-through view over SOURCE.datasource; "if not exists" keeps the statement safe to re-run.
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
@ -140,7 +148,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;

-- datasource_results re-projects TARGET.result_datasources with the columns renamed:
-- the result id becomes "result" and the datasource becomes "id".
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
-- Hive statistics syntax (Impala's "compute stats" is not valid in Hive).
ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;

-- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization;
@ -157,28 +165,28 @@ create view if not exists TARGET.project_resultcount as select * from SOURCE.pro
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;

-- project_results re-projects TARGET.result_projects with the columns renamed:
-- the result id becomes "result" and the project becomes "id".
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;

-- indicators
-- Each indicator table below copies the SOURCE table of the same name, keeping only rows whose id
-- exists in TARGET.result, and then gathers statistics with Hive's ANALYZE TABLE.

-- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;

create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;

create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;

-- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;

create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;

create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;

create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;

---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;

-- Collaboration indicators are exposed as unfiltered pass-through views over SOURCE.
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
@ -187,30 +195,30 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;

-- Each indicator table below copies the SOURCE table of the same name, keeping only rows that
-- match a TARGET.result id, and then gathers statistics with Hive's ANALYZE TABLE.

---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;

create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;

create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;

---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;

---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;

-- The download indicators are matched on orig.result_id rather than orig.id.
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;

create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;

create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;

create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;

---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;

create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;

-- Organization-level indicators are exposed as unfiltered pass-through views over SOURCE.
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
@ -221,11 +229,12 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;

-- Each table below copies the SOURCE table of the same name, keeping only rows whose id exists
-- in TARGET.result, and then gathers statistics with Hive's ANALYZE TABLE.
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;

create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;

create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;

-- Disabled indicators, kept for reference. NOTE(review): the disabled statistics statements still
-- use Impala's "compute stats"; switch them to ANALYZE TABLE ... COMPUTE STATISTICS if re-enabled.
--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--compute stats TARGET.indi_datasets_gold_oa;
--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--compute stats TARGET.indi_software_gold_oa;

View File

@ -8,6 +8,8 @@ from ${stats_db_name}.result r
group by rl.id group by rl.id
) rln on rln.id=r.id; ) rln on rln.id=r.id;
ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_country stored as parquet as create table ${observatory_db_name}.result_affiliated_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_year stored as parquet as create table ${observatory_db_name}.result_affiliated_year stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -215,6 +229,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS;
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_country stored as parquet as create table ${observatory_db_name}.result_deposited_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_year stored as parquet as create table ${observatory_db_name}.result_deposited_year stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_organization stored as parquet as create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_funder stored as parquet as create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS;
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select select
count(distinct r.id) as total, count(distinct r.id) as total,
@ -559,3 +595,5 @@ from ${stats_db_name}.result r
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS;

View File

@ -96,6 +96,6 @@ select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results STORED AS PARQUET AS CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources; FROM ${stats_db_name}.result_datasources;

View File

@ -74,7 +74,7 @@
</configuration> </configuration>
</global> </global>
<start to="Contexts"/> <start to="Step1"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -302,23 +302,23 @@
<param>stats_db_name=${stats_db_name}</param> <param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param> <param>openaire_db_name=${openaire_db_name}</param>
</hive2> </hive2>
<ok to="Step19-finalize"/>
<error to="Kill"/>
</action>
<action name="Step19-finalize">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizedb.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<file>finalizedb.sh</file>
</shell>
<ok to="step20-createMonitorDB"/> <ok to="step20-createMonitorDB"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="Step19-finalize">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>finalizedb.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${stats_db_shadow_name}</argument>-->
<!-- <file>finalizedb.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step20-createMonitorDB"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="step20-createMonitorDB"> <action name="step20-createMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
@ -355,67 +355,67 @@
<param>stats_db_name=${stats_db_name}</param> <param>stats_db_name=${stats_db_name}</param>
<param>observatory_db_name=${observatory_db_name}</param> <param>observatory_db_name=${observatory_db_name}</param>
</hive2> </hive2>
<ok to="step21-createObservatoryDB-post"/>
<error to="Kill"/>
</action>
<action name="step21-createObservatoryDB-post">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>observatory-post.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<file>observatory-post.sh</file>
</shell>
<ok to="step22-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="step22-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${external_stats_db_name}</argument>
<argument>${stats_db_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${observatory_db_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="step23-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="step23-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${stats_db_shadow_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="Step24-updateCache"/>
<error to="Kill"/>
</action>
<action name="Step24-updateCache">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>updateCache.sh</exec>
<argument>${stats_tool_api_url}</argument>
<file>updateCache.sh</file>
</shell>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="step21-createObservatoryDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>observatory-post.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <argument>${observatory_db_shadow_name}</argument>-->
<!-- <file>observatory-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step22-copyDataToImpalaCluster"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="step22-copyDataToImpalaCluster">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>copyDataToImpalaCluster.sh</exec>-->
<!-- <argument>${external_stats_db_name}</argument>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <file>copyDataToImpalaCluster.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step23-finalizeImpalaCluster"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="step23-finalizeImpalaCluster">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>finalizeImpalaCluster.sh</exec>-->
<!-- <argument>${stats_db_name}</argument>-->
<!-- <argument>${stats_db_shadow_name}</argument>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <argument>${observatory_db_name}</argument>-->
<!-- <argument>${observatory_db_shadow_name}</argument>-->
<!-- <file>finalizeImpalaCluster.sh</file>-->
<!-- </shell>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="Step24-updateCache">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>updateCache.sh</exec>-->
<!-- <argument>${stats_tool_api_url}</argument>-->
<!-- <file>updateCache.sh</file>-->
<!-- </shell>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>