diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index 6c5823b0c..6d42ab13d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -16,7 +16,7 @@ curl -L ${CONTEXT_API}/contexts/?type=ri,community -H "accept: application/json" cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv -cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv +cat categories.csv | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv echo "uploading context data to hdfs" hdfs dfs -mkdir ${TMP} diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index fb944f4ff..93faa43d6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -15,5 +15,5 @@ hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" impala-shell -q "invalidate metadata" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f - +cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - echo "Indicators created" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh similarity index 58% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index ff03bca03..db8d39af2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -9,16 +9,9 @@ fi export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -export SCRIPT_PATH=$4 -echo "Getting file from " $4 -hdfs dfs -copyToLocal $4 - -echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - -cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +impala-shell -q "invalidate metadata;" +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - echo "Impala shell finished" echo "Updating shadow observatory database" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh new file mode 100644 index 000000000..92543b8b8 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -0,0 +1,16 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 77fbd3b18..fc0162a9c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -23,6 +23,11 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; +CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS +SELECT * +FROM ${external_stats_db_name}.licenses_normalized; + + ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql similarity index 100% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql similarity index 100% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql deleted file mode 100644 index 481fd9e8c..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ /dev/null @@ -1,62 +0,0 @@ ----------------------------------------------------- --- Shortcuts for various definitions in stats db --- ----------------------------------------------------- - --- Peer reviewed: --- Results that have been collected from Crossref -create table ${stats_db_name}.result_peerreviewed as -with peer_reviewed as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_sources rs on rs.id=r.id - join ${stats_db_name}.datasource d on d.id=rs.datasource - where d.name='Crossref') -select distinct peer_reviewed.id as id, true as peer_reviewed -from peer_reviewed -union all -select distinct r.id as id, false as peer_reviewed -from ${stats_db_name}.result r -left outer join peer_reviewed pr on pr.id=r.id -where pr.id is null; - --- Green OA: --- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal. -create table ${stats_db_name}.result_greenoa as -with result_green as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_datasources rd on rd.id=r.id - join ${stats_db_name}.datasource d on d.id=rd.datasource - left outer join ( - select rd.id from ${stats_db_name}.result_datasources rd - join ${stats_db_name}.datasource d on rd.datasource=d.id - join ${stats_db_name}.datasource_sources sds on sds.id=d.id - join ${stats_db_name}.datasource sd on sd.id=sds.datasource - where sd.name='DOAJ-ARTICLES' - ) as doaj on doaj.id=r.id - where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null) -select distinct result_green.id, true as green -from result_green -union all -select distinct r.id as id, false as green -from ${stats_db_name}.result r -left outer join result_green rg on rg.id=r.id -where rg.id is null; - --- GOLD OA: --- OA results that have been harvested from a DOAJ journal. -create table ${stats_db_name}.result_gold as -with result_gold as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_datasources rd on rd.id=r.id - join ${stats_db_name}.datasource d on d.id=rd.datasource - join ${stats_db_name}.datasource_sources sds on sds.id=d.id - join ${stats_db_name}.datasource sd on sd.id=sds.datasource - where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles') -select distinct result_gold.id, true as gold -from result_gold -union all -select distinct r.id, false as gold -from ${stats_db_name}.result r -where r.id not in (select id from result_gold); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql new file mode 100644 index 000000000..6b4d9b1b0 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -0,0 +1,22 @@ +---------------------------------------------------- +-- Shortcuts for various definitions in stats db --- +---------------------------------------------------- + +-- Peer reviewed: +create table ${stats_db_name}.result_peerreviewed as +select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed +from ${stats_db_name}.result r +left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id +left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; + +-- Green OA: +create table ${stats_db_name}.result_greenoa as +select r.id, case when green.green_oa=1 then true else false end as green +from ${stats_db_name}.result r +left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; + +-- GOLD OA: +create table ${stats_db_name}.result_gold as +select r.id, case when gold.gold_oa=1 then true else false end as gold +from ${stats_db_name}.result r + left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 5da028304..9ea50d488 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -104,25 +104,42 @@ create table TARGET.project_results as select id as result, project as id from T compute stats TARGET.project_results; -- indicators -create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; - -create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; - -create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; - -create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; - +create view TARGET.indi_dataset_avg_year_content_oa as select * from SOURCE.indi_dataset_avg_year_content_oa orig; +create view TARGET.indi_dataset_avg_year_context_oa as select * from SOURCE.indi_dataset_avg_year_context_oa orig; create view TARGET.indi_dataset_avg_year_country_oa as select * from SOURCE.indi_dataset_avg_year_country_oa orig; + +create view TARGET.indi_other_avg_year_content_oa as select * from SOURCE.indi_other_avg_year_content_oa orig; +create view TARGET.indi_other_avg_year_context_oa as select * from SOURCE.indi_other_avg_year_context_oa orig; +create view TARGET.indi_other_avg_year_country_oa as select * from SOURCE.indi_other_avg_year_country_oa orig; + create view TARGET.indi_project_datasets_count as select * from SOURCE.indi_project_datasets_count orig; create view TARGET.indi_project_otherresearch_count as select * from SOURCE.indi_project_otherresearch_count orig; create view TARGET.indi_project_pubs_count as select * from SOURCE.indi_project_pubs_count orig; create view TARGET.indi_project_software_count as select * from SOURCE.indi_project_software_count orig; + +create view TARGET.indi_pub_avg_year_content_oa as select * from SOURCE.indi_pub_avg_year_content_oa orig; +create view TARGET.indi_pub_avg_year_context_oa as select * from SOURCE.indi_pub_avg_year_context_oa orig; create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig; +create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_green_oa; +create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_grey_lit; +create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_doi_from_crossref; +create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_gold_oa; +create table TARGET.indi_pub_has_abstract as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_abstract; +create table TARGET.indi_pub_has_cc_licence as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_cc_licence; +create table TARGET.indi_pub_has_cc_licence_url as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_cc_licence_url; + +create view TARGET.indi_software_avg_year_content_oa as select * from SOURCE.indi_software_avg_year_content_oa orig; +create view TARGET.indi_software_avg_year_context_oa as select * from SOURCE.indi_software_avg_year_context_oa orig; +create view TARGET.indi_software_avg_year_country_oa as select * from SOURCE.indi_software_avg_year_country_oa orig; + --denorm alter table TARGET.result rename to TARGET.res_tmp; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 40cdf3f6d..e24370e7d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,259 +1,561 @@ -create table TARGET.result_affiliated_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; +create table ${observatory_db_name}.result_cc_licence stored as parquet as +select r.id, coalesce(rln.count, 0) > 0 as cc_licence +from ${stats_db_name}.result r + left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from ${stats_db_name}.result_licenses rl + left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id; -create table TARGET.result_affiliated_year stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; +create table ${observatory_db_name}.result_affiliated_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -create table TARGET.result_affiliated_year_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; +create table ${observatory_db_name}.result_affiliated_year stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -create table TARGET.result_affiliated_datasource stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_datasources rd on rd.id=r.id -left outer join SOURCE.datasource d on d.id=rd.datasource -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; +create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -create table TARGET.result_affiliated_datasource_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_datasources rd on rd.id=r.id -left outer join SOURCE.datasource d on d.id=rd.datasource -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; +create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id + left outer join ${stats_db_name}.datasource d on d.id=rd.datasource + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -create table TARGET.result_affiliated_organization stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; +create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id + left outer join ${stats_db_name}.datasource d on d.id=rd.datasource + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -create table TARGET.result_affiliated_organization_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; +create table ${observatory_db_name}.result_affiliated_organization stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -create table TARGET.result_affiliated_funder stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; +create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -create table TARGET.result_affiliated_funder_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; +create table ${observatory_db_name}.result_affiliated_funder stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -create table TARGET.result_deposited_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; +create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -create table TARGET.result_deposited_year stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; +create table ${observatory_db_name}.result_deposited_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -create table TARGET.result_deposited_year_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; +create table ${observatory_db_name}.result_deposited_year stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -create table TARGET.result_deposited_datasource stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; +create table ${observatory_db_name}.result_deposited_year_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -create table TARGET.result_deposited_datasource_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; +create table ${observatory_db_name}.result_deposited_datasource stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -create table TARGET.result_deposited_organization stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; +create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -create table TARGET.result_deposited_organization_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; +create table ${observatory_db_name}.result_deposited_organization stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -create table TARGET.result_deposited_funder stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; +create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -create table TARGET.result_deposited_funder_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname -from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; +create table ${observatory_db_name}.result_deposited_funder stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -compute stats TARGET.result_affiliated_country; -compute stats TARGET.result_affiliated_year; -compute stats TARGET.result_affiliated_year_country; -compute stats TARGET.result_affiliated_datasource; -compute stats TARGET.result_affiliated_datasource_country; -compute stats TARGET.result_affiliated_organization; -compute stats TARGET.result_affiliated_organization_country; -compute stats TARGET.result_affiliated_funder; -compute stats TARGET.result_affiliated_funder_country; -compute stats TARGET.result_deposited_country; -compute stats TARGET.result_deposited_year; -compute stats TARGET.result_deposited_year_country; -compute stats TARGET.result_deposited_datasource; -compute stats TARGET.result_deposited_datasource_country; -compute stats TARGET.result_deposited_organization; -compute stats TARGET.result_deposited_organization_country; -compute stats TARGET.result_deposited_funder; -compute stats TARGET.result_deposited_funder_country; +create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder, c.code as ccode, c.name as cname +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index a329ca4bf..08d33f4e8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -239,14 +239,51 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - + - + ${hive_jdbc_url} - + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + + + + + + + + ${jobTracker} + ${nameNode} + contexts.sh + ${context_api_url} + ${stats_db_name} + contexts.sh + + + + + + + + ${jobTracker} + ${nameNode} + indicators.sh + ${stats_db_name} + ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql + indicators.sh + + + + + + + + ${hive_jdbc_url} + stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} @@ -261,48 +298,11 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - - - - - - - - ${jobTracker} - ${nameNode} - indicators.sh - ${stats_db_name} - ${wf:appPath()}/scripts/step16_7-createIndicatorsTables.sql - indicators.sh - - - - - - - - ${jobTracker} - ${nameNode} - contexts.sh - ${context_api_url} - ${stats_db_name} - contexts.sh - - + - + ${jobTracker} ${nameNode} @@ -326,20 +326,44 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh + + + + + + + ${jobTracker} + ${nameNode} + observatory-pre.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + observatory-pre.sh + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + observatory_db_name=${observatory_db_name} + + + + + + ${jobTracker} ${nameNode} - observatory.sh + observatory-post.sh ${stats_db_name} ${observatory_db_name} ${observatory_db_shadow_name} - ${wf:appPath()}/scripts/step21-createObservatoryDB.sql - observatory.sh + observatory-post.sh