From ccee451ddeb84ef73e15ba1f853ff7a72f6861c6 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 7 Sep 2021 23:17:13 +0300 Subject: [PATCH 01/14] added indicators of sprint 2 in monitor db --- .../scripts/step20-createMonitorDB.sql | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 5da028304..9ea50d488 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -104,25 +104,42 @@ create table TARGET.project_results as select id as result, project as id from T compute stats TARGET.project_results; -- indicators -create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; - -create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; - -create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; - -create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; - +create view TARGET.indi_dataset_avg_year_content_oa as select * from SOURCE.indi_dataset_avg_year_content_oa orig; +create view TARGET.indi_dataset_avg_year_context_oa as select * from SOURCE.indi_dataset_avg_year_context_oa orig; create view TARGET.indi_dataset_avg_year_country_oa as select * from SOURCE.indi_dataset_avg_year_country_oa orig; + +create view TARGET.indi_other_avg_year_content_oa as select * from SOURCE.indi_other_avg_year_content_oa orig; +create view TARGET.indi_other_avg_year_context_oa as select * from SOURCE.indi_other_avg_year_context_oa orig; +create view TARGET.indi_other_avg_year_country_oa as select * from SOURCE.indi_other_avg_year_country_oa orig; + create view TARGET.indi_project_datasets_count as select * from SOURCE.indi_project_datasets_count orig; create view TARGET.indi_project_otherresearch_count as select * from SOURCE.indi_project_otherresearch_count orig; create view TARGET.indi_project_pubs_count as select * from SOURCE.indi_project_pubs_count orig; create view TARGET.indi_project_software_count as select * from SOURCE.indi_project_software_count orig; + +create view TARGET.indi_pub_avg_year_content_oa as select * from SOURCE.indi_pub_avg_year_content_oa orig; +create view TARGET.indi_pub_avg_year_context_oa as select * from SOURCE.indi_pub_avg_year_context_oa orig; create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig; +create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_green_oa; +create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_grey_lit; +create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_doi_from_crossref; +create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_gold_oa; +create table TARGET.indi_pub_has_abstract as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_abstract; +create table TARGET.indi_pub_has_cc_licence as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_cc_licence; +create table TARGET.indi_pub_has_cc_licence_url as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_cc_licence_url; + +create view TARGET.indi_software_avg_year_content_oa as select * from SOURCE.indi_software_avg_year_content_oa orig; +create view TARGET.indi_software_avg_year_context_oa as select * from SOURCE.indi_software_avg_year_context_oa orig; +create view TARGET.indi_software_avg_year_country_oa as select * from SOURCE.indi_software_avg_year_country_oa orig; + --denorm alter table TARGET.result rename to TARGET.res_tmp; From 1250ae197f1cdc7865335b7a6005a8f2f611ea96 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 8 Sep 2021 14:08:43 +0300 Subject: [PATCH 02/14] using new indicators for the definition of peerreviewed, gold, and green --- ....sql => step16-createIndicatorsTables.sql} | 0 .../graph/stats/oozie_app/scripts/step16.sql | 62 ------------------- .../scripts/step16_1-definitions.sql | 22 +++++++ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 40 ++++++------ 4 files changed, 42 insertions(+), 82 deletions(-) rename dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/{step16_7-createIndicatorsTables.sql => step16-createIndicatorsTables.sql} (100%) delete mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql similarity index 100% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql deleted file mode 100644 index 481fd9e8c..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ /dev/null @@ -1,62 +0,0 @@ ----------------------------------------------------- --- Shortcuts for various definitions in stats db --- ----------------------------------------------------- - --- Peer reviewed: --- Results that have been collected from Crossref -create table ${stats_db_name}.result_peerreviewed as -with peer_reviewed as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_sources rs on rs.id=r.id - join ${stats_db_name}.datasource d on d.id=rs.datasource - where d.name='Crossref') -select distinct peer_reviewed.id as id, true as peer_reviewed -from peer_reviewed -union all -select distinct r.id as id, false as peer_reviewed -from ${stats_db_name}.result r -left outer join peer_reviewed pr on pr.id=r.id -where pr.id is null; - --- Green OA: --- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal. -create table ${stats_db_name}.result_greenoa as -with result_green as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_datasources rd on rd.id=r.id - join ${stats_db_name}.datasource d on d.id=rd.datasource - left outer join ( - select rd.id from ${stats_db_name}.result_datasources rd - join ${stats_db_name}.datasource d on rd.datasource=d.id - join ${stats_db_name}.datasource_sources sds on sds.id=d.id - join ${stats_db_name}.datasource sd on sd.id=sds.datasource - where sd.name='DOAJ-ARTICLES' - ) as doaj on doaj.id=r.id - where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null) -select distinct result_green.id, true as green -from result_green -union all -select distinct r.id as id, false as green -from ${stats_db_name}.result r -left outer join result_green rg on rg.id=r.id -where rg.id is null; - --- GOLD OA: --- OA results that have been harvested from a DOAJ journal. -create table ${stats_db_name}.result_gold as -with result_gold as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_datasources rd on rd.id=r.id - join ${stats_db_name}.datasource d on d.id=rd.datasource - join ${stats_db_name}.datasource_sources sds on sds.id=d.id - join ${stats_db_name}.datasource sd on sd.id=sds.datasource - where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles') -select distinct result_gold.id, true as gold -from result_gold -union all -select distinct r.id, false as gold -from ${stats_db_name}.result r -where r.id not in (select id from result_gold); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql new file mode 100644 index 000000000..484e0772c --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -0,0 +1,22 @@ +---------------------------------------------------- +-- Shortcuts for various definitions in stats db --- +---------------------------------------------------- + +-- Peer reviewed: +create table ${stats_db_name}.result_peerreviewed as +select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed +from result r +left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id +left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; + +-- Green OA: +create table ${stats_db_name}.result_greenoa as +select r.id, case when green.green_oa=1 then true else false end as green +from ${stats_db_name}.result r +left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; + +-- GOLD OA: +create table ${stats_db_name}.result_gold as +select r.id, case when green.green_oa=1 then true else false end as green +from ${stats_db_name}.result r + left outer join ${stats_db_name}.indi_pub_gold_oa green on green.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index a329ca4bf..6d618e489 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -239,14 +239,27 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - + - + + + ${jobTracker} + ${nameNode} + indicators.sh + ${stats_db_name} + ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql + indicators.sh + + + + + + ${hive_jdbc_url} - + stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} @@ -272,24 +285,11 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - + - - - ${jobTracker} - ${nameNode} - indicators.sh - ${stats_db_name} - ${wf:appPath()}/scripts/step16_7-createIndicatorsTables.sql - indicators.sh - - - - - - + ${jobTracker} ${nameNode} @@ -298,11 +298,11 @@ ${stats_db_name} contexts.sh - + - + ${jobTracker} ${nameNode} From c6ada217a19a7b96754417e67df421b9a07987bd Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 8 Sep 2021 22:34:59 +0300 Subject: [PATCH 03/14] fixed typo --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index fb944f4ff..93faa43d6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -15,5 +15,5 @@ hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" impala-shell -q "invalidate metadata" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f - +cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - echo "Indicators created" \ No newline at end of file From f13cca7e83f2b3f7328f5a3e0c111eee0d9879e3 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 8 Sep 2021 23:07:58 +0300 Subject: [PATCH 04/14] moved dependencies of indicators before them... --- .../scripts/{step16_6.sql => step15_5.sql} | 0 .../dhp/oa/graph/stats/oozie_app/workflow.xml | 22 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) rename dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/{step16_6.sql => step15_5.sql} (100%) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql similarity index 100% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 6d618e489..5d18ad3e0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -239,6 +239,17 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + + + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + @@ -274,17 +285,6 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - From 43852bac0eb65769052b6a153c57940c8e3ed549 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 01:36:41 +0300 Subject: [PATCH 05/14] creating other::other concept for all contexts --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index 6c5823b0c..6d42ab13d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -16,7 +16,7 @@ curl -L ${CONTEXT_API}/contexts/?type=ri,community -H "accept: application/json" cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv -cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv +cat categories.csv | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv echo "uploading context data to hdfs" hdfs dfs -mkdir ${TMP} From 461bf90ca6eec21e45866e0a89db2ed5728afeea Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 11:10:30 +0300 Subject: [PATCH 06/14] fixed the gold_oa definition --- .../oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 484e0772c..6e2d9a262 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -17,6 +17,6 @@ left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: create table ${stats_db_name}.result_gold as -select r.id, case when green.green_oa=1 then true else false end as green +select r.id, case when gold.gold_oa=1 then true else false end as gold from ${stats_db_name}.result r - left outer join ${stats_db_name}.indi_pub_gold_oa green on green.id=r.id; \ No newline at end of file + left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file From 8fc89ae82278ce169fbd95c568d0a5d9a003a709 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 14:33:23 +0300 Subject: [PATCH 07/14] moved context table creation before indicators --- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 5d18ad3e0..8fe05a933 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -250,6 +250,19 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + + + + + + + ${jobTracker} + ${nameNode} + contexts.sh + ${context_api_url} + ${stats_db_name} + contexts.sh + @@ -285,19 +298,6 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${jobTracker} - ${nameNode} - contexts.sh - ${context_api_url} - ${stats_db_name} - contexts.sh - From 9b1936701c852860f225c88b663fc4634668e175 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 21:07:44 +0300 Subject: [PATCH 08/14] fixed yet another typo --- .../oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 6e2d9a262..6b4d9b1b0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -5,7 +5,7 @@ -- Peer reviewed: create table ${stats_db_name}.result_peerreviewed as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed -from result r +from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; From de9bf3a161931e0e7fa4baa7ae12f5c84e70de5f Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 14 Sep 2021 01:29:08 +0300 Subject: [PATCH 09/14] added cc_licences and abstracts in observatory db --- .../graph/stats/oozie_app/scripts/step10.sql | 5 + .../scripts/step21-createObservatoryDB.sql | 92 +++++++++++-------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 77fbd3b18..fc0162a9c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -23,6 +23,11 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; +CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS +SELECT * +FROM ${external_stats_db_name}.licenses_normalized; + + ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 40cdf3f6d..f0e5a8dab 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,41 +1,44 @@ create table TARGET.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -44,12 +47,13 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -58,35 +62,38 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -95,12 +102,13 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table TARGET.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -109,12 +117,13 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; create table TARGET.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -123,11 +132,12 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -136,12 +146,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -150,12 +161,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -164,12 +176,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -178,11 +191,12 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -191,12 +205,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -205,12 +220,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -221,12 +237,13 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table TARGET.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -237,7 +254,8 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; compute stats TARGET.result_affiliated_country; compute stats TARGET.result_affiliated_year; @@ -256,4 +274,4 @@ compute stats TARGET.result_deposited_datasource_country; compute stats TARGET.result_deposited_organization; compute stats TARGET.result_deposited_organization_country; compute stats TARGET.result_deposited_funder; -compute stats TARGET.result_deposited_funder_country; +compute stats TARGET.result_deposited_funder_country; \ No newline at end of file From dd2329849f0cf2455e34321f59e31c8d97235f7c Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 16 Sep 2021 13:50:34 +0300 Subject: [PATCH 10/14] fixed the definition of cc_licence --- .../scripts/step21-createObservatoryDB.sql | 167 ++++++++++++++---- 1 file changed, 131 insertions(+), 36 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index f0e5a8dab..7c344b903 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,44 +1,61 @@ create table TARGET.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -47,13 +64,18 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -62,38 +84,54 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -102,13 +140,18 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table TARGET.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -117,13 +160,18 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; create table TARGET.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -132,12 +180,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -146,13 +200,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -161,13 +220,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -176,13 +240,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -191,12 +260,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -205,13 +280,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -220,13 +300,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -237,13 +322,18 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table TARGET.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -254,7 +344,12 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; compute stats TARGET.result_affiliated_country; From 2943287d1005ebb004c9068d8b27ef046a935543 Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 16 Sep 2021 15:59:06 +0300 Subject: [PATCH 11/14] fixed the definition of cc_licence, part II --- .../scripts/step21-createObservatoryDB.sql | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 7c344b903..d71978a30 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,7 +1,7 @@ create table TARGET.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -19,7 +19,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -37,7 +37,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -55,7 +55,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -75,7 +75,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -95,7 +95,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -113,7 +113,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -131,7 +131,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -151,7 +151,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -171,7 +171,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -191,7 +191,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -211,7 +211,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -231,7 +231,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -251,7 +251,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -271,7 +271,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -291,7 +291,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -311,7 +311,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -333,7 +333,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') From 8b681dcf1b99cd7fa0c1046abe52b3b35622bf83 Mon Sep 17 00:00:00 2001 From: antleb Date: Sat, 18 Sep 2021 00:35:14 +0300 Subject: [PATCH 12/14] attempt to make the observatory wf run in hive --- .../oa/graph/stats/oozie_app/observatory.sh | 4 +- .../scripts/step21-createObservatoryDB.sql | 840 +++++++++++------- 2 files changed, 527 insertions(+), 317 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh index ff03bca03..7db8d40a5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh @@ -18,7 +18,9 @@ echo "Creating observatory database" impala-shell -q "drop database if exists ${TARGET} cascade" impala-shell -q "create database if not exists ${TARGET}" impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - -cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | hive -f - +impala-shell -q "invalidate metadata;" +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - echo "Impala shell finished" echo "Updating shadow observatory database" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index d71978a30..f17b5358f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,372 +1,580 @@ -create table TARGET.result_affiliated_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname +create table TARGET.result_cc_licence stored as parquet as +select r.id, coalesce(rln.count, 0) > 0 as cc_licence from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( + left outer join ( select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; +) rln on rln.id=r.id; + +create table TARGET.result_affiliated_country stored as parquet as +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + c.code as ccode, c.name as cname +from SOURCE.result r + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; create table TARGET.result_affiliated_year stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; create table TARGET.result_affiliated_year_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; create table TARGET.result_affiliated_datasource stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_datasources rd on rd.id=r.id -left outer join SOURCE.datasource d on d.id=rd.datasource -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_datasources rd on rd.id=r.id + left outer join SOURCE.datasource d on d.id=rd.datasource + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; create table TARGET.result_affiliated_datasource_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_datasources rd on rd.id=r.id -left outer join SOURCE.datasource d on d.id=rd.datasource -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_datasources rd on rd.id=r.id + left outer join SOURCE.datasource d on d.id=rd.datasource + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; create table TARGET.result_affiliated_organization stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; create table TARGET.result_affiliated_organization_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; create table TARGET.result_affiliated_funder stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + join SOURCE.result_projects rp on rp.id=r.id + join SOURCE.project p on p.id=rp.project + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; create table TARGET.result_affiliated_funder_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_organization ro on ro.id=r.id -join SOURCE.organization o on o.id=ro.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; + join SOURCE.result_organization ro on ro.id=r.id + join SOURCE.organization o on o.id=ro.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + join SOURCE.result_projects rp on rp.id=r.id + join SOURCE.project p on p.id=rp.project + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; create table TARGET.result_deposited_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; create table TARGET.result_deposited_year stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; create table TARGET.result_deposited_year_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + r.year, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; create table TARGET.result_deposited_datasource stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; create table TARGET.result_deposited_datasource_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; create table TARGET.result_deposited_organization stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; create table TARGET.result_deposited_organization_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; create table TARGET.result_deposited_funder stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + join SOURCE.result_projects rp on rp.id=r.id + join SOURCE.project p on p.id=rp.project + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; create table TARGET.result_deposited_funder_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + join SOURCE.result_projects rp on rp.id=r.id + join SOURCE.project p on p.id=rp.project + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -compute stats TARGET.result_affiliated_country; -compute stats TARGET.result_affiliated_year; -compute stats TARGET.result_affiliated_year_country; -compute stats TARGET.result_affiliated_datasource; -compute stats TARGET.result_affiliated_datasource_country; -compute stats TARGET.result_affiliated_organization; -compute stats TARGET.result_affiliated_organization_country; -compute stats TARGET.result_affiliated_funder; -compute stats TARGET.result_affiliated_funder_country; -compute stats TARGET.result_deposited_country; -compute stats TARGET.result_deposited_year; -compute stats TARGET.result_deposited_year_country; -compute stats TARGET.result_deposited_datasource; -compute stats TARGET.result_deposited_datasource_country; -compute stats TARGET.result_deposited_organization; -compute stats TARGET.result_deposited_organization_country; -compute stats TARGET.result_deposited_funder; -compute stats TARGET.result_deposited_funder_country; \ No newline at end of file +-- compute stats TARGET.result_affiliated_country; +-- compute stats TARGET.result_affiliated_year; +-- compute stats TARGET.result_affiliated_year_country; +-- compute stats TARGET.result_affiliated_datasource; +-- compute stats TARGET.result_affiliated_datasource_country; +-- compute stats TARGET.result_affiliated_organization; +-- compute stats TARGET.result_affiliated_organization_country; +-- compute stats TARGET.result_affiliated_funder; +-- compute stats TARGET.result_affiliated_funder_country; +-- compute stats TARGET.result_deposited_country; +-- compute stats TARGET.result_deposited_year; +-- compute stats TARGET.result_deposited_year_country; +-- compute stats TARGET.result_deposited_datasource; +-- compute stats TARGET.result_deposited_datasource_country; +-- compute stats TARGET.result_deposited_organization; +-- compute stats TARGET.result_deposited_organization_country; +-- compute stats TARGET.result_deposited_funder; +-- compute stats TARGET.result_deposited_funder_country; \ No newline at end of file From 421d55265d7c567092bbf418cdce6b818a9dbd77 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 21 Sep 2021 03:07:58 +0300 Subject: [PATCH 13/14] created hive action for observatory queries --- .../{observatory.sh => observatory-post.sh} | 9 - .../graph/stats/oozie_app/observatory-pre.sh | 16 + .../scripts/step21-createObservatoryDB.sql | 449 +++++++++--------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 30 +- 4 files changed, 258 insertions(+), 246 deletions(-) rename dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/{observatory.sh => observatory-post.sh} (63%) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh similarity index 63% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index 7db8d40a5..db8d39af2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -9,16 +9,7 @@ fi export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -export SCRIPT_PATH=$4 -echo "Getting file from " $4 -hdfs dfs -copyToLocal $4 - -echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - -cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | hive -f - impala-shell -q "invalidate metadata;" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - echo "Impala shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh new file mode 100644 index 000000000..92543b8b8 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -0,0 +1,16 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index f17b5358f..e0bdcd685 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,14 +1,14 @@ -create table TARGET.result_cc_licence stored as parquet as +create table ${observatory_db_name}.result_cc_licence stored as parquet as select r.id, coalesce(rln.count, 0) > 0 as cc_licence -from SOURCE.result r +from ${stats_db_name}.result r left outer join ( select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + from ${stats_db_name}.result_licenses rl + left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license group by rl.id ) rln on rln.id=r.id; -create table TARGET.result_affiliated_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -24,20 +24,20 @@ select rfc.count > 1 as multiple_funders, r.type, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -create table TARGET.result_affiliated_year stored as parquet as +create table ${observatory_db_name}.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, @@ -53,20 +53,20 @@ select rfc.count > 1 as multiple_funders, r.type, r.year -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -create table TARGET.result_affiliated_year_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -82,20 +82,20 @@ select rfc.count > 1 as multiple_funders, r.type, r.year, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -create table TARGET.result_affiliated_datasource stored as parquet as +create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, @@ -111,22 +111,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_datasources rd on rd.id=r.id - left outer join SOURCE.datasource d on d.id=rd.datasource - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id + left outer join ${stats_db_name}.datasource d on d.id=rd.datasource + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -create table TARGET.result_affiliated_datasource_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -142,22 +142,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_datasources rd on rd.id=r.id - left outer join SOURCE.datasource d on d.id=rd.datasource - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id + left outer join ${stats_db_name}.datasource d on d.id=rd.datasource + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -create table TARGET.result_affiliated_organization stored as parquet as +create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, @@ -173,20 +173,20 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -create table TARGET.result_affiliated_organization_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -202,20 +202,20 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -create table TARGET.result_affiliated_funder stored as parquet as +create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, @@ -231,22 +231,22 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -create table TARGET.result_affiliated_funder_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -262,22 +262,22 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -create table TARGET.result_deposited_country stored as parquet as +create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -293,22 +293,22 @@ select rfc.count > 1 as multiple_funders, r.type, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -create table TARGET.result_deposited_year stored as parquet as +create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, @@ -324,22 +324,22 @@ select rfc.count > 1 as multiple_funders, r.type, r.year -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -create table TARGET.result_deposited_year_country stored as parquet as +create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -355,22 +355,22 @@ select rfc.count > 1 as multiple_funders, r.type, r.year, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -create table TARGET.result_deposited_datasource stored as parquet as +create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, @@ -386,22 +386,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -create table TARGET.result_deposited_datasource_country stored as parquet as +create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -417,22 +417,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -create table TARGET.result_deposited_organization stored as parquet as +create table ${observatory_db_name}.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, @@ -448,22 +448,22 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -create table TARGET.result_deposited_organization_country stored as parquet as +create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -479,22 +479,22 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -create table TARGET.result_deposited_funder stored as parquet as +create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, @@ -510,24 +510,24 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -create table TARGET.result_deposited_funder_country stored as parquet as +create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -543,38 +543,19 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; - --- compute stats TARGET.result_affiliated_country; --- compute stats TARGET.result_affiliated_year; --- compute stats TARGET.result_affiliated_year_country; --- compute stats TARGET.result_affiliated_datasource; --- compute stats TARGET.result_affiliated_datasource_country; --- compute stats TARGET.result_affiliated_organization; --- compute stats TARGET.result_affiliated_organization_country; --- compute stats TARGET.result_affiliated_funder; --- compute stats TARGET.result_affiliated_funder_country; --- compute stats TARGET.result_deposited_country; --- compute stats TARGET.result_deposited_year; --- compute stats TARGET.result_deposited_year_country; --- compute stats TARGET.result_deposited_datasource; --- compute stats TARGET.result_deposited_datasource_country; --- compute stats TARGET.result_deposited_organization; --- compute stats TARGET.result_deposited_organization_country; --- compute stats TARGET.result_deposited_funder; --- compute stats TARGET.result_deposited_funder_country; \ No newline at end of file + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 8fe05a933..08d33f4e8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -326,20 +326,44 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh + + + + + + + ${jobTracker} + ${nameNode} + observatory-pre.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + observatory-pre.sh + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + observatory_db_name=${observatory_db_name} + + + + + + ${jobTracker} ${nameNode} - observatory.sh + observatory-post.sh ${stats_db_name} ${observatory_db_name} ${observatory_db_shadow_name} - ${wf:appPath()}/scripts/step21-createObservatoryDB.sql - observatory.sh + observatory-post.sh From f358cabb2bc63be8d31a069afbe476465a4c1038 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 22 Sep 2021 21:50:37 +0300 Subject: [PATCH 14/14] fixed typo --- .../scripts/step21-createObservatoryDB.sql | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index e0bdcd685..e24370e7d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -30,7 +30,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -59,7 +59,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -88,7 +88,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -119,7 +119,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.datasource d on d.id=rd.datasource left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -150,7 +150,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.datasource d on d.id=rd.datasource left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -179,7 +179,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -208,7 +208,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -239,7 +239,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -270,7 +270,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -301,7 +301,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -332,7 +332,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -363,7 +363,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -394,7 +394,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -425,7 +425,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -456,7 +456,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -487,7 +487,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -520,7 +520,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -553,7 +553,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,