From ccee451ddeb84ef73e15ba1f853ff7a72f6861c6 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 7 Sep 2021 23:17:13 +0300 Subject: [PATCH 01/18] added indicators of sprint 2 in monitor db --- .../scripts/step20-createMonitorDB.sql | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 5da028304..9ea50d488 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -104,25 +104,42 @@ create table TARGET.project_results as select id as result, project as id from T compute stats TARGET.project_results; -- indicators -create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; - -create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; - -create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; - -create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; - +create view TARGET.indi_dataset_avg_year_content_oa as select * from SOURCE.indi_dataset_avg_year_content_oa orig; +create view TARGET.indi_dataset_avg_year_context_oa as select 
* from SOURCE.indi_dataset_avg_year_context_oa orig; create view TARGET.indi_dataset_avg_year_country_oa as select * from SOURCE.indi_dataset_avg_year_country_oa orig; + +create view TARGET.indi_other_avg_year_content_oa as select * from SOURCE.indi_other_avg_year_content_oa orig; +create view TARGET.indi_other_avg_year_context_oa as select * from SOURCE.indi_other_avg_year_context_oa orig; +create view TARGET.indi_other_avg_year_country_oa as select * from SOURCE.indi_other_avg_year_country_oa orig; + create view TARGET.indi_project_datasets_count as select * from SOURCE.indi_project_datasets_count orig; create view TARGET.indi_project_otherresearch_count as select * from SOURCE.indi_project_otherresearch_count orig; create view TARGET.indi_project_pubs_count as select * from SOURCE.indi_project_pubs_count orig; create view TARGET.indi_project_software_count as select * from SOURCE.indi_project_software_count orig; + +create view TARGET.indi_pub_avg_year_content_oa as select * from SOURCE.indi_pub_avg_year_content_oa orig; +create view TARGET.indi_pub_avg_year_context_oa as select * from SOURCE.indi_pub_avg_year_context_oa orig; create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig; +create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_green_oa; +create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_grey_lit; +create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_doi_from_crossref; +create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); +compute stats TARGET.indi_pub_gold_oa; +create table TARGET.indi_pub_has_abstract as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_abstract; +create table TARGET.indi_pub_has_cc_licence as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_cc_licence; +create table TARGET.indi_pub_has_cc_licence_url as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_cc_licence_url; + +create view TARGET.indi_software_avg_year_content_oa as select * from SOURCE.indi_software_avg_year_content_oa orig; +create view TARGET.indi_software_avg_year_context_oa as select * from SOURCE.indi_software_avg_year_context_oa orig; +create view TARGET.indi_software_avg_year_country_oa as select * from SOURCE.indi_software_avg_year_country_oa orig; + --denorm alter table TARGET.result rename to TARGET.res_tmp; From 1250ae197f1cdc7865335b7a6005a8f2f611ea96 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 8 Sep 2021 14:08:43 +0300 Subject: [PATCH 02/18] using new indicators for the definition of peerreviewed, gold, and green --- ....sql => step16-createIndicatorsTables.sql} | 0 .../graph/stats/oozie_app/scripts/step16.sql | 62 ------------------- .../scripts/step16_1-definitions.sql | 22 +++++++ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 40 ++++++------ 4 files changed, 42 insertions(+), 82 deletions(-) rename dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/{step16_7-createIndicatorsTables.sql => step16-createIndicatorsTables.sql} (100%) delete mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql create mode 100644 
dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql similarity index 100% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql deleted file mode 100644 index 481fd9e8c..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ /dev/null @@ -1,62 +0,0 @@ ----------------------------------------------------- --- Shortcuts for various definitions in stats db --- ----------------------------------------------------- - --- Peer reviewed: --- Results that have been collected from Crossref -create table ${stats_db_name}.result_peerreviewed as -with peer_reviewed as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_sources rs on rs.id=r.id - join ${stats_db_name}.datasource d on d.id=rs.datasource - where d.name='Crossref') -select distinct peer_reviewed.id as id, true as peer_reviewed -from peer_reviewed -union all -select distinct r.id as id, false as peer_reviewed -from ${stats_db_name}.result r -left outer join peer_reviewed pr on pr.id=r.id -where pr.id is null; - --- Green OA: --- OA results that are hosted by an Institutional repository and have 
NOT been harvested from a DOAJ journal. -create table ${stats_db_name}.result_greenoa as -with result_green as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_datasources rd on rd.id=r.id - join ${stats_db_name}.datasource d on d.id=rd.datasource - left outer join ( - select rd.id from ${stats_db_name}.result_datasources rd - join ${stats_db_name}.datasource d on rd.datasource=d.id - join ${stats_db_name}.datasource_sources sds on sds.id=d.id - join ${stats_db_name}.datasource sd on sd.id=sds.datasource - where sd.name='DOAJ-ARTICLES' - ) as doaj on doaj.id=r.id - where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null) -select distinct result_green.id, true as green -from result_green -union all -select distinct r.id as id, false as green -from ${stats_db_name}.result r -left outer join result_green rg on rg.id=r.id -where rg.id is null; - --- GOLD OA: --- OA results that have been harvested from a DOAJ journal. 
-create table ${stats_db_name}.result_gold as -with result_gold as ( - select distinct r.id as id - from ${stats_db_name}.result r - join ${stats_db_name}.result_datasources rd on rd.id=r.id - join ${stats_db_name}.datasource d on d.id=rd.datasource - join ${stats_db_name}.datasource_sources sds on sds.id=d.id - join ${stats_db_name}.datasource sd on sd.id=sds.datasource - where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles') -select distinct result_gold.id, true as gold -from result_gold -union all -select distinct r.id, false as gold -from ${stats_db_name}.result r -where r.id not in (select id from result_gold); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql new file mode 100644 index 000000000..484e0772c --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -0,0 +1,22 @@ +---------------------------------------------------- +-- Shortcuts for various definitions in stats db --- +---------------------------------------------------- + +-- Peer reviewed: +create table ${stats_db_name}.result_peerreviewed as +select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed +from result r +left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id +left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; + +-- Green OA: +create table ${stats_db_name}.result_greenoa as +select r.id, case when green.green_oa=1 then true else false end as green +from ${stats_db_name}.result r +left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; + +-- GOLD OA: +create table ${stats_db_name}.result_gold as +select r.id, 
case when green.green_oa=1 then true else false end as green +from ${stats_db_name}.result r + left outer join ${stats_db_name}.indi_pub_gold_oa green on green.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index a329ca4bf..6d618e489 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -239,14 +239,27 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - + - + + + ${jobTracker} + ${nameNode} + indicators.sh + ${stats_db_name} + ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql + indicators.sh + + + + + + ${hive_jdbc_url} - + stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} @@ -272,24 +285,11 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - + - - - ${jobTracker} - ${nameNode} - indicators.sh - ${stats_db_name} - ${wf:appPath()}/scripts/step16_7-createIndicatorsTables.sql - indicators.sh - - - - - - + ${jobTracker} ${nameNode} @@ -298,11 +298,11 @@ ${stats_db_name} contexts.sh - + - + ${jobTracker} ${nameNode} From c6ada217a19a7b96754417e67df421b9a07987bd Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 8 Sep 2021 22:34:59 +0300 Subject: [PATCH 03/18] fixed typo --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index fb944f4ff..93faa43d6 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -15,5 +15,5 @@ hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" impala-shell -q "invalidate metadata" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f - +cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - echo "Indicators created" \ No newline at end of file From f13cca7e83f2b3f7328f5a3e0c111eee0d9879e3 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 8 Sep 2021 23:07:58 +0300 Subject: [PATCH 04/18] moved dependencies of indicators before them... --- .../scripts/{step16_6.sql => step15_5.sql} | 0 .../dhp/oa/graph/stats/oozie_app/workflow.xml | 22 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) rename dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/{step16_6.sql => step15_5.sql} (100%) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql similarity index 100% rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 6d618e489..5d18ad3e0 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -239,6 +239,17 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + + + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + @@ -274,17 +285,6 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - From 43852bac0eb65769052b6a153c57940c8e3ed549 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 01:36:41 +0300 Subject: [PATCH 05/18] creating other::other concept for all contexts --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index 6c5823b0c..6d42ab13d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -16,7 +16,7 @@ curl -L ${CONTEXT_API}/contexts/?type=ri,community -H "accept: application/json" cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv -cat categories.csv | grep -v ::other | 
sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv +cat categories.csv | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv echo "uploading context data to hdfs" hdfs dfs -mkdir ${TMP} From 461bf90ca6eec21e45866e0a89db2ed5728afeea Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 11:10:30 +0300 Subject: [PATCH 06/18] fixed the gold_oa definition --- .../oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 484e0772c..6e2d9a262 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -17,6 +17,6 @@ left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: create table ${stats_db_name}.result_gold as -select r.id, case when green.green_oa=1 then true else false end as green +select r.id, case when gold.gold_oa=1 then true else false end as gold from ${stats_db_name}.result r - left outer join ${stats_db_name}.indi_pub_gold_oa green on green.id=r.id; \ No newline at end of file + left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file From 8fc89ae82278ce169fbd95c568d0a5d9a003a709 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 14:33:23 +0300 Subject: [PATCH 07/18] moved context table creation before indicators --- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 5d18ad3e0..8fe05a933 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -250,6 +250,19 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + + + + + + + ${jobTracker} + ${nameNode} + contexts.sh + ${context_api_url} + ${stats_db_name} + contexts.sh + @@ -285,19 +298,6 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${jobTracker} - ${nameNode} - contexts.sh - ${context_api_url} - ${stats_db_name} - contexts.sh - From 9b1936701c852860f225c88b663fc4634668e175 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 13 Sep 2021 21:07:44 +0300 Subject: [PATCH 08/18] fixed yet another typo --- .../oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 6e2d9a262..6b4d9b1b0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -5,7 +5,7 @@ -- Peer reviewed: create table ${stats_db_name}.result_peerreviewed as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed -from result r +from ${stats_db_name}.result r left outer join 
${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; From de9bf3a161931e0e7fa4baa7ae12f5c84e70de5f Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 14 Sep 2021 01:29:08 +0300 Subject: [PATCH 09/18] added cc_licences and abstracts in observatory db --- .../graph/stats/oozie_app/scripts/step10.sql | 5 + .../scripts/step21-createObservatoryDB.sql | 92 +++++++++++-------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 77fbd3b18..fc0162a9c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -23,6 +23,11 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; +CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS +SELECT * +FROM ${external_stats_db_name}.licenses_normalized; + + ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 40cdf3f6d..f0e5a8dab 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,41 +1,44 @@ create table TARGET.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join 
SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open 
Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -44,12 +47,13 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -58,35 +62,38 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on 
pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name 
as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -95,12 +102,13 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table TARGET.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, 
r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -109,12 +117,13 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; create table TARGET.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -123,11 +132,12 @@ join SOURCE.organization o on o.id=dor.organization join 
SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -136,12 +146,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then 
true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -150,12 +161,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -164,12 +176,13 @@ join SOURCE.organization o on 
o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -178,11 +191,12 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else 
false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -191,12 +205,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in 
('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -205,12 +220,13 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -221,12 +237,13 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table TARGET.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when 
rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -237,7 +254,8 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; +left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; compute stats TARGET.result_affiliated_country; compute stats TARGET.result_affiliated_year; @@ -256,4 +274,4 @@ compute stats TARGET.result_deposited_datasource_country; compute stats TARGET.result_deposited_organization; compute stats TARGET.result_deposited_organization_country; compute stats TARGET.result_deposited_funder; -compute stats TARGET.result_deposited_funder_country; +compute stats TARGET.result_deposited_funder_country; \ No newline at end of file From aefa36c54bc2b56cd906170cc0d25f8dbf4f6e48 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 14 Sep 2021 17:26:15 +0200 Subject: [PATCH 10/18] other task executions go ahead if UnknownHostException happens on a single task --- .../orcid/SparkDownloadOrcidAuthors.java | 18 +++++++++++++++++- 
.../orcid/SparkDownloadOrcidWorks.java | 16 +++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index 2b8e42bf6..8f0b3a094 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -4,8 +4,11 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.FileNotFoundException; +import java.net.InetAddress; +import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.List; import java.util.Optional; import org.apache.commons.io.IOUtils; @@ -18,6 +21,7 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.util.LongAccumulator; @@ -78,6 +82,7 @@ public class SparkDownloadOrcidAuthors { LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503"); LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525"); LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic"); + LongAccumulator unknowHostAcc = spark.sparkContext().longAccumulator("error_unknowHost"); logger.info("Retrieving data from lamda sequence file"); JavaPairRDD lamdaFileRDD = sc @@ -107,7 +112,17 @@ public class SparkDownloadOrcidAuthors { httpGet.addHeader("Accept", "application/vnd.orcid+xml"); 
httpGet.addHeader("Authorization", String.format("Bearer %s", token)); long startReq = System.currentTimeMillis(); - CloseableHttpResponse response = client.execute(httpGet); + CloseableHttpResponse response = null; + try { + response = client.execute(httpGet); + } catch (UnknownHostException u) { + downloaded.setStatusCode(-1); + unknowHostAcc.add(1); + if (client != null) { + client.close(); + } + return downloaded.toTuple2(); + } long endReq = System.currentTimeMillis(); long reqTime = endReq - startReq; if (reqTime < 1000) { @@ -171,6 +186,7 @@ public class SparkDownloadOrcidAuthors { logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value()); logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value()); logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value()); + logger.info("unknowHostAcc: {}", unknowHostAcc.value()); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java index cab538783..457c79adb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java @@ -3,6 +3,8 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.net.InetAddress; +import java.net.UnknownHostException; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.*; @@ -96,6 +98,7 @@ public class SparkDownloadOrcidWorks { LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503"); LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525"); LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic"); + LongAccumulator unknowHostAcc = 
spark.sparkContext().longAccumulator("error_unknowHost"); JavaPairRDD updatedAuthorsRDD = sc .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class); @@ -154,7 +157,17 @@ public class SparkDownloadOrcidWorks { httpGet.addHeader("Accept", "application/vnd.orcid+xml"); httpGet.addHeader("Authorization", String.format("Bearer %s", token)); long startReq = System.currentTimeMillis(); - CloseableHttpResponse response = client.execute(httpGet); + CloseableHttpResponse response = null; + try { + response = client.execute(httpGet); + } catch (UnknownHostException u) { + downloaded.setStatusCode(-1); + unknowHostAcc.add(1); + if (client != null) { + client.close(); + } + return downloaded.toTuple2(); + } long endReq = System.currentTimeMillis(); long reqTime = endReq - startReq; if (reqTime < 1000) { @@ -219,6 +232,7 @@ public class SparkDownloadOrcidWorks { logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value()); logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value()); logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value()); + logger.info("unknowHostAcc: {}", unknowHostAcc.value()); }); } From 8b804e7fe1d1602d33bfb388a4e3d99c40acd22c Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 14 Sep 2021 17:30:52 +0200 Subject: [PATCH 11/18] removed unused imports --- .../eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java | 3 --- .../eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java | 1 - 2 files changed, 4 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index 8f0b3a094..c0aa007e5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -4,11 +4,9 @@ package eu.dnetlib.doiboost.orcid; 
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.FileNotFoundException; -import java.net.InetAddress; import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.List; import java.util.Optional; import org.apache.commons.io.IOUtils; @@ -21,7 +19,6 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.util.LongAccumulator; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java index 457c79adb..63ba151f6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java @@ -3,7 +3,6 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.net.InetAddress; import java.net.UnknownHostException; import java.time.LocalDate; import java.time.format.DateTimeFormatter; From dd2329849f0cf2455e34321f59e31c8d97235f7c Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 16 Sep 2021 13:50:34 +0300 Subject: [PATCH 12/18] fixed the definition of cc_licence --- .../scripts/step21-createObservatoryDB.sql | 167 ++++++++++++++---- 1 file changed, 131 insertions(+), 36 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index f0e5a8dab..7c344b903 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,44 +1,61 @@ create table TARGET.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as 
licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join 
SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -47,13 +64,18 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from 
SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -62,38 +84,54 @@ left outer join SOURCE.result_datasources rd on rd.id=r.id left outer join SOURCE.datasource d on d.id=rd.datasource left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode 
in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids 
pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -102,13 +140,18 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table 
TARGET.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -117,13 +160,18 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; create table TARGET.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as 
cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -132,12 +180,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name; create table TARGET.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -146,13 +200,18 @@ join SOURCE.organization o on o.id=dor.organization 
join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year; create table TARGET.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -161,13 +220,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on 
rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name; create table TARGET.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -176,13 +240,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name; create table TARGET.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true 
else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -191,12 +260,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name; create table TARGET.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, + rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as 
oname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -205,13 +280,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; create table TARGET.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -220,13 +300,18 @@ join SOURCE.organization o on o.id=dor.organization join SOURCE.country c on c.code=o.country and c.continent_name='Europe' left outer join SOURCE.result_licenses rl on rl.id=r.id left 
outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; create table TARGET.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -237,13 +322,18 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; create table 
TARGET.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, case when lower(rln.normalized) like 'cc-%' then true else false end as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -254,7 +344,12 @@ join SOURCE.result_projects rp on rp.id=r.id join SOURCE.project p on p.id=rp.project left outer join SOURCE.result_licenses rl on rl.id=r.id left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join SOURCE.licenses_normalized rln on rln.license=rl.type +left outer join ( + select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count + from SOURCE.result_licenses rl + left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + group by rl.id +) rln on rln.id=r.id group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; compute stats TARGET.result_affiliated_country; From 2943287d1005ebb004c9068d8b27ef046a935543 Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 16 Sep 2021 15:59:06 +0300 Subject: [PATCH 13/18] fixed the definition of cc_licence, part II --- .../scripts/step21-createObservatoryDB.sql | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 7c344b903..d71978a30 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,7 +1,7 @@ create table TARGET.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -19,7 +19,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on 
o.id=ro.organization @@ -37,7 +37,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -55,7 +55,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -75,7 +75,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open 
Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -95,7 +95,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -113,7 +113,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join 
SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -131,7 +131,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -151,7 +151,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_organization ro on ro.id=r.id join SOURCE.organization o on o.id=ro.organization @@ -171,7 +171,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, 
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -191,7 +191,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -211,7 +211,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as 
cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -231,7 +231,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -251,7 +251,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname from SOURCE.result r join 
SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -271,7 +271,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname + coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -291,7 +291,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -311,7 
+311,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') @@ -333,7 +333,7 @@ group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, create table TARGET.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, rln.count > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname + r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r join SOURCE.result_datasources rd on rd.id=r.id join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') From 8b681dcf1b99cd7fa0c1046abe52b3b35622bf83 Mon Sep 17 00:00:00 2001 From: antleb Date: Sat, 18 Sep 2021 00:35:14 +0300 Subject: [PATCH 14/18] attempt to make the observatory wf run in 
hive
---
Review notes (everything between the "---" marker and the first "diff --git"
line is free-form and is not part of the commit or of the applied change):

  * step21 now materialises the Creative Commons flag once, as
    TARGET.result_cc_licence (one row per result id, boolean cc_licence), and
    every aggregate table joins that helper instead of repeating the
    licenses_normalized sub-query.
  * The fourteen aggregate queries in this part of the patch originally joined
    SOURCE.result_cc_licence. The visible part of the script only ever creates
    the helper in TARGET, and observatory.sh only creates views from SOURCE
    into TARGET, so the joins below read TARGET.result_cc_licence.
  * multiple_projects / multiple_funders come from left outer joins, so they
    are NULL (not false) for results without a row in result_projectcount /
    result_fundercount -- NOTE(review): confirm that this is intended.
  * result_projectcount and result_fundercount are still read from SOURCE;
    they are presumably produced by an earlier stats step -- TODO confirm.

 .../oa/graph/stats/oozie_app/observatory.sh   |   4 +-
 .../scripts/step21-createObservatoryDB.sql    | 840 +++++++++++-------
 2 files changed, 527 insertions(+), 317 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh
index ff03bca03..7db8d40a5 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh
@@ -18,7 +18,9 @@ echo "Creating observatory database"
 impala-shell -q "drop database if exists ${TARGET} cascade"
 impala-shell -q "create database if not exists ${TARGET}"
 impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
-cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f -
+cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | hive -f -
+impala-shell -q "invalidate metadata;"
+impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f -
 echo "Impala shell finished"
 
 echo "Updating shadow observatory database"
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
index d71978a30..f17b5358f 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
@@ -1,372 +1,580 @@
-create table TARGET.result_affiliated_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname
+create table TARGET.result_cc_licence stored as parquet as
+select r.id, coalesce(rln.count, 0) > 0 as cc_licence
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
+         left outer join (
     select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
     from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
+             left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
     group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name;
+) rln on rln.id=r.id;
+
+create table TARGET.result_affiliated_country stored as parquet as
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    c.code as ccode, c.name as cname
+from SOURCE.result r
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
 
 create table TARGET.result_affiliated_year stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed,
-    coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    r.year
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
 
 create table TARGET.result_affiliated_year_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    r.year, c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
 
 create table TARGET.result_affiliated_datasource stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed,
-    coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    d.name as dname
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_datasources rd on rd.id=r.id
-left outer join SOURCE.datasource d on d.id=rd.datasource
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_datasources rd on rd.id=r.id
+         left outer join SOURCE.datasource d on d.id=rd.datasource
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
 
 create table TARGET.result_affiliated_datasource_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    d.name as dname, c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_datasources rd on rd.id=r.id
-left outer join SOURCE.datasource d on d.id=rd.datasource
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_datasources rd on rd.id=r.id
+         left outer join SOURCE.datasource d on d.id=rd.datasource
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
 
 create table TARGET.result_affiliated_organization stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    o.name as oname
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
 
 create table TARGET.result_affiliated_organization_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    o.name as oname, c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
 
 create table TARGET.result_affiliated_funder stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed,
-    coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    p.funder as pfunder
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-join SOURCE.result_projects rp on rp.id=r.id
-join SOURCE.project p on p.id=rp.project
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         join SOURCE.result_projects rp on rp.id=r.id
+         join SOURCE.project p on p.id=rp.project
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
 
 create table TARGET.result_affiliated_funder_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    p.funder as pfunder, c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_organization ro on ro.id=r.id
-join SOURCE.organization o on o.id=ro.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-join SOURCE.result_projects rp on rp.id=r.id
-join SOURCE.project p on p.id=rp.project
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name;
+         join SOURCE.result_organization ro on ro.id=r.id
+         join SOURCE.organization o on o.id=ro.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         join SOURCE.result_projects rp on rp.id=r.id
+         join SOURCE.project p on p.id=rp.project
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
 
 create table TARGET.result_deposited_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_datasources rd on rd.id=r.id
-join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
-join SOURCE.datasource_organizations dor on dor.id=d.id
-join SOURCE.organization o on o.id=dor.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, c.code, c.name;
+         join SOURCE.result_datasources rd on rd.id=r.id
+         join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+         join SOURCE.datasource_organizations dor on dor.id=d.id
+         join SOURCE.organization o on o.id=dor.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
 
 create table TARGET.result_deposited_year stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed,
-    coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    r.year
 from SOURCE.result r
-join SOURCE.result_datasources rd on rd.id=r.id
-join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
-join SOURCE.datasource_organizations dor on dor.id=d.id
-join SOURCE.organization o on o.id=dor.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year;
+         join SOURCE.result_datasources rd on rd.id=r.id
+         join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+         join SOURCE.datasource_organizations dor on dor.id=d.id
+         join SOURCE.organization o on o.id=dor.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
 
 create table TARGET.result_deposited_year_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, r.year, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    r.year, c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_datasources rd on rd.id=r.id
-join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
-join SOURCE.datasource_organizations dor on dor.id=d.id
-join SOURCE.organization o on o.id=dor.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, r.year, c.code, c.name;
+         join SOURCE.result_datasources rd on rd.id=r.id
+         join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+         join SOURCE.datasource_organizations dor on dor.id=d.id
+         join SOURCE.organization o on o.id=dor.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
 
 create table TARGET.result_deposited_datasource stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    d.name as dname
 from SOURCE.result r
-join SOURCE.result_datasources rd on rd.id=r.id
-join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
-join SOURCE.datasource_organizations dor on dor.id=d.id
-join SOURCE.organization o on o.id=dor.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name;
+         join SOURCE.result_datasources rd on rd.id=r.id
+         join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+         join SOURCE.datasource_organizations dor on dor.id=d.id
+         join SOURCE.organization o on o.id=dor.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
 
 create table TARGET.result_deposited_datasource_country stored as parquet as
-select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
-    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
-    r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, d.name as dname, c.code as ccode, c.name as cname
+select
+    count(distinct r.id) as total,
+    r.green,
+    r.gold,
+    case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid,
+    case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed,
+    rln.cc_licence,
+    r.abstract as abstract,
+    r.authors > 1 as multiple_authors,
+    rpc.count > 1 as multiple_projects,
+    rfc.count > 1 as multiple_funders,
+    r.type,
+    d.name as dname, c.code as ccode, c.name as cname
 from SOURCE.result r
-join SOURCE.result_datasources rd on rd.id=r.id
-join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
-join SOURCE.datasource_organizations dor on dor.id=d.id
-join SOURCE.organization o on o.id=dor.organization
-join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
-left outer join SOURCE.result_licenses rl on rl.id=r.id
-left outer join SOURCE.result_pids pids on pids.id=r.id
-left outer join (
-    select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
-    from SOURCE.result_licenses rl
-    left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
-    group by rl.id
-) rln on rln.id=r.id
-group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, d.name, c.code, c.name;
+         join SOURCE.result_datasources rd on rd.id=r.id
+         join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+         join SOURCE.datasource_organizations dor on dor.id=d.id
+         join SOURCE.organization o on o.id=dor.organization
+         join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+         left outer join SOURCE.result_licenses rl on rl.id=r.id
+         left outer join SOURCE.result_pids pids on pids.id=r.id
+         left outer join TARGET.result_cc_licence rln on rln.id=r.id
+         left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
+         left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
+         case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
+         cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
 
 create table
TARGET.result_deposited_organization stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, - coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication 
Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; create table TARGET.result_deposited_organization_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, o.name as oname, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + o.name as oname, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in 
('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, o.name, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; create table TARGET.result_deposited_funder stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid 
is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on 
o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + join SOURCE.result_projects rp on rp.id=r.id + join SOURCE.project p on p.id=rp.project + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; create table TARGET.result_deposited_funder_country stored as parquet as -select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, - case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, - r.peer_reviewed, coalesce(rln.count, 0) > 0 as cc_licence, r.abstract as abstract, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +select + count(distinct r.id) as total, + r.green, + r.gold, + case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, + rln.cc_licence, + r.abstract as abstract, + r.authors > 1 as multiple_authors, + rpc.count > 1 as multiple_projects, + rfc.count > 1 as multiple_funders, + r.type, + p.funder as pfunder, c.code as ccode, c.name as cname from SOURCE.result r -join SOURCE.result_datasources rd on rd.id=r.id -join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional 
Repository','Data Repository', 'Repository', 'Publication Repository') -join SOURCE.datasource_organizations dor on dor.id=d.id -join SOURCE.organization o on o.id=dor.organization -join SOURCE.country c on c.code=o.country and c.continent_name='Europe' -join SOURCE.result_projects rp on rp.id=r.id -join SOURCE.project p on p.id=rp.project -left outer join SOURCE.result_licenses rl on rl.id=r.id -left outer join SOURCE.result_pids pids on pids.id=r.id -left outer join ( - select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license - group by rl.id -) rln on rln.id=r.id -group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, cc_licence, abstract, p.funder, c.code, c.name; + join SOURCE.result_datasources rd on rd.id=r.id + join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join SOURCE.datasource_organizations dor on dor.id=d.id + join SOURCE.organization o on o.id=dor.organization + join SOURCE.country c on c.code=o.country and c.continent_name='Europe' + join SOURCE.result_projects rp on rp.id=r.id + join SOURCE.project p on p.id=rp.project + left outer join SOURCE.result_licenses rl on rl.id=r.id + left outer join SOURCE.result_pids pids on pids.id=r.id + left outer join SOURCE.result_cc_licence rln on rln.id=r.id + left outer join SOURCE.result_projectcount rpc on rpc.id=r.id + left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, + case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -compute stats 
TARGET.result_affiliated_country; -compute stats TARGET.result_affiliated_year; -compute stats TARGET.result_affiliated_year_country; -compute stats TARGET.result_affiliated_datasource; -compute stats TARGET.result_affiliated_datasource_country; -compute stats TARGET.result_affiliated_organization; -compute stats TARGET.result_affiliated_organization_country; -compute stats TARGET.result_affiliated_funder; -compute stats TARGET.result_affiliated_funder_country; -compute stats TARGET.result_deposited_country; -compute stats TARGET.result_deposited_year; -compute stats TARGET.result_deposited_year_country; -compute stats TARGET.result_deposited_datasource; -compute stats TARGET.result_deposited_datasource_country; -compute stats TARGET.result_deposited_organization; -compute stats TARGET.result_deposited_organization_country; -compute stats TARGET.result_deposited_funder; -compute stats TARGET.result_deposited_funder_country; \ No newline at end of file +-- compute stats TARGET.result_affiliated_country; +-- compute stats TARGET.result_affiliated_year; +-- compute stats TARGET.result_affiliated_year_country; +-- compute stats TARGET.result_affiliated_datasource; +-- compute stats TARGET.result_affiliated_datasource_country; +-- compute stats TARGET.result_affiliated_organization; +-- compute stats TARGET.result_affiliated_organization_country; +-- compute stats TARGET.result_affiliated_funder; +-- compute stats TARGET.result_affiliated_funder_country; +-- compute stats TARGET.result_deposited_country; +-- compute stats TARGET.result_deposited_year; +-- compute stats TARGET.result_deposited_year_country; +-- compute stats TARGET.result_deposited_datasource; +-- compute stats TARGET.result_deposited_datasource_country; +-- compute stats TARGET.result_deposited_organization; +-- compute stats TARGET.result_deposited_organization_country; +-- compute stats TARGET.result_deposited_funder; +-- compute stats TARGET.result_deposited_funder_country; \ No newline at end of 
file From 92a63f78fe29bad5162af1a7964710a8d721ccf7 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 20 Sep 2021 18:25:00 +0200 Subject: [PATCH 15/18] multiple download attempts handling if a connection to orcid server fails --- .../orcid/SparkDownloadOrcidAuthors.java | 103 +++---- .../orcid/SparkDownloadOrcidWorks.java | 97 +++---- .../doiboost/orcid/util/DownloadsReport.java | 10 + .../util/MultiAttemptsHttpConnector.java | 272 ++++++++++++++++++ .../doiboost/orcid/OrcidClientTest.java | 92 +++++- 5 files changed, 439 insertions(+), 135 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/DownloadsReport.java create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/MultiAttemptsHttpConnector.java diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index c0aa007e5..c549beb03 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -4,7 +4,6 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.FileNotFoundException; -import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Optional; @@ -13,10 +12,6 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; import 
org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -26,8 +21,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData; +import eu.dnetlib.doiboost.orcid.util.DownloadsReport; import eu.dnetlib.doiboost.orcid.util.HDFSUtil; +import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector; import scala.Tuple2; public class SparkDownloadOrcidAuthors { @@ -73,17 +72,12 @@ public class SparkDownloadOrcidAuthors { LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records"); LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records"); LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records"); - LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403"); - LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404"); - LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409"); - LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503"); - LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525"); - LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic"); - LongAccumulator unknowHostAcc = spark.sparkContext().longAccumulator("error_unknowHost"); + LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors"); - logger.info("Retrieving data from lamda sequence file"); + String lambdaFilePath = workingPath + lambdaFileName; + logger.info("Retrieving data from lamda sequence file: " + lambdaFilePath); JavaPairRDD lamdaFileRDD = sc - 
.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class); + .sequenceFile(lambdaFilePath, Text.class, Text.class); final long lamdaFileRDDCount = lamdaFileRDD.count(); logger.info("Data retrieved: {}", lamdaFileRDDCount); @@ -104,57 +98,44 @@ public class SparkDownloadOrcidAuthors { final DownloadedRecordData downloaded = new DownloadedRecordData(); downloaded.setOrcidId(orcidId); downloaded.setLastModifiedDate(lastModifiedDate); - CloseableHttpClient client = HttpClients.createDefault(); - HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); - httpGet.addHeader("Accept", "application/vnd.orcid+xml"); - httpGet.addHeader("Authorization", String.format("Bearer %s", token)); + final HttpClientParams clientParams = new HttpClientParams(); + MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams); + httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); + httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); + httpConnector.setAuthToken(token); + String apiUrl = "https://api.orcid.org/v3.0/" + orcidId + "/record"; + DownloadsReport report = new DownloadsReport(); long startReq = System.currentTimeMillis(); - CloseableHttpResponse response = null; + boolean downloadCompleted = false; + String record = ""; try { - response = client.execute(httpGet); - } catch (UnknownHostException u) { - downloaded.setStatusCode(-1); - unknowHostAcc.add(1); - if (client != null) { - client.close(); + record = httpConnector.getInputSource(apiUrl, report); + downloadCompleted = true; + } catch (CollectorException ce) { + if (!report.isEmpty()) { + int errCode = report.keySet().stream().findFirst().get(); + report.forEach((k, v) -> { + logger.error(k + " " + v); + }); + downloaded.setStatusCode(errCode); + } else { + downloaded.setStatusCode(-4); } - return downloaded.toTuple2(); + errorsAcc.add(1); } long endReq = System.currentTimeMillis(); long reqTime = endReq - startReq; if (reqTime < 1000) { 
Thread.sleep(1000 - reqTime); } - int statusCode = response.getStatusLine().getStatusCode(); - downloaded.setStatusCode(statusCode); - if (statusCode != 200) { - switch (statusCode) { - case 403: - errorHTTP403Acc.add(1); - break; - case 404: - errorHTTP404Acc.add(1); - break; - case 409: - errorHTTP409Acc.add(1); - break; - case 503: - errorHTTP503Acc.add(1); - break; - case 525: - errorHTTP525Acc.add(1); - break; - default: - errorHTTPGenericAcc.add(1); - } - return downloaded.toTuple2(); + if (downloadCompleted) { + downloaded.setStatusCode(200); + downloadedRecordsAcc.add(1); + downloaded + .setCompressedData( + ArgumentApplicationParser + .compressArgument(record)); } - downloadedRecordsAcc.add(1); - downloaded - .setCompressedData( - ArgumentApplicationParser - .compressArgument(IOUtils.toString(response.getEntity().getContent()))); - client.close(); return downloaded.toTuple2(); }; @@ -165,27 +146,17 @@ public class SparkDownloadOrcidAuthors { long authorsModifiedCount = authorsModifiedRDD.count(); logger.info("Authors modified count: {}", authorsModifiedCount); - logger.info("Start downloading ..."); - final JavaPairRDD pairRDD = authorsModifiedRDD .repartition(100) .map(downloadRecordFn) .mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))); - saveAsSequenceFile(workingPath, outputPath, sc, pairRDD); logger.info("parsedRecordsAcc: {}", parsedRecordsAcc.value()); logger.info("modifiedRecordsAcc: {}", modifiedRecordsAcc.value()); logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value()); - logger.info("errorHTTP403Acc: {}", errorHTTP403Acc.value()); - logger.info("errorHTTP404Acc: {}", errorHTTP404Acc.value()); - logger.info("errorHTTP409Acc: {}", errorHTTP409Acc.value()); - logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value()); - logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value()); - logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value()); - logger.info("unknowHostAcc: {}", unknowHostAcc.value()); + 
logger.info("errorsAcc: {}", errorsAcc.value()); }); - } private static void saveAsSequenceFile(String workingPath, String outputPath, JavaSparkContext sc, diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java index 63ba151f6..0569bacfd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidWorks.java @@ -3,7 +3,6 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.net.UnknownHostException; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.*; @@ -12,10 +11,6 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -29,8 +24,12 @@ import com.google.gson.JsonElement; import com.google.gson.JsonParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData; +import eu.dnetlib.doiboost.orcid.util.DownloadsReport; import eu.dnetlib.doiboost.orcid.util.HDFSUtil; +import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector; import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; import scala.Tuple2; @@ -91,13 +90,7 @@ public 
class SparkDownloadOrcidWorks { .sparkContext() .longAccumulator("error_parsing_xml_found"); LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records"); - LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403"); - LongAccumulator errorHTTP404Acc = spark.sparkContext().longAccumulator("error_HTTP_404"); - LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409"); - LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503"); - LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525"); - LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic"); - LongAccumulator unknowHostAcc = spark.sparkContext().longAccumulator("error_unknowHost"); + LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors"); JavaPairRDD updatedAuthorsRDD = sc .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class); @@ -151,61 +144,44 @@ public class SparkDownloadOrcidWorks { final DownloadedRecordData downloaded = new DownloadedRecordData(); downloaded.setOrcidId(orcidId); downloaded.setLastModifiedDate(lastUpdateValue); - CloseableHttpClient client = HttpClients.createDefault(); - HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + relativeWorkUrl); - httpGet.addHeader("Accept", "application/vnd.orcid+xml"); - httpGet.addHeader("Authorization", String.format("Bearer %s", token)); + final HttpClientParams clientParams = new HttpClientParams(); + MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams); + httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); + httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); + httpConnector.setAuthToken(token); + String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorkUrl; + DownloadsReport report = new DownloadsReport(); long startReq = 
System.currentTimeMillis(); - CloseableHttpResponse response = null; + boolean downloadCompleted = false; + String record = ""; try { - response = client.execute(httpGet); - } catch (UnknownHostException u) { - downloaded.setStatusCode(-1); - unknowHostAcc.add(1); - if (client != null) { - client.close(); + record = httpConnector.getInputSource(apiUrl, report); + downloadCompleted = true; + } catch (CollectorException ce) { + if (!report.isEmpty()) { + int errCode = report.keySet().stream().findFirst().get(); + report.forEach((k, v) -> { + logger.error(k + " " + v); + }); + downloaded.setStatusCode(errCode); + } else { + downloaded.setStatusCode(-4); } - return downloaded.toTuple2(); + errorsAcc.add(1); } long endReq = System.currentTimeMillis(); long reqTime = endReq - startReq; if (reqTime < 1000) { Thread.sleep(1000 - reqTime); } - int statusCode = response.getStatusLine().getStatusCode(); - downloaded.setStatusCode(statusCode); - if (statusCode != 200) { - switch (statusCode) { - case 403: - errorHTTP403Acc.add(1); - break; - case 404: - errorHTTP404Acc.add(1); - break; - case 409: - errorHTTP409Acc.add(1); - break; - case 503: - errorHTTP503Acc.add(1); - break; - case 525: - errorHTTP525Acc.add(1); - break; - default: - errorHTTPGenericAcc.add(1); - logger - .info( - "Downloading {} status code: {}", orcidId, - response.getStatusLine().getStatusCode()); - } - return downloaded.toTuple2(); + if (downloadCompleted) { + downloaded.setStatusCode(200); + downloadedRecordsAcc.add(1); + downloaded + .setCompressedData( + ArgumentApplicationParser + .compressArgument(record)); } - downloadedRecordsAcc.add(1); - downloaded - .setCompressedData( - ArgumentApplicationParser - .compressArgument(IOUtils.toString(response.getEntity().getContent()))); - client.close(); return downloaded.toTuple2(); }; @@ -226,12 +202,7 @@ public class SparkDownloadOrcidWorks { logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value()); logger.info("errorParsingXMLFoundAcc: 
{}", errorParsingXMLFoundAcc.value()); logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value()); - logger.info("errorHTTP403Acc: {}", errorHTTP403Acc.value()); - logger.info("errorHTTP409Acc: {}", errorHTTP409Acc.value()); - logger.info("errorHTTP503Acc: {}", errorHTTP503Acc.value()); - logger.info("errorHTTP525Acc: {}", errorHTTP525Acc.value()); - logger.info("errorHTTPGenericAcc: {}", errorHTTPGenericAcc.value()); - logger.info("unknowHostAcc: {}", unknowHostAcc.value()); + logger.info("errorsAcc: {}", errorsAcc.value()); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/DownloadsReport.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/DownloadsReport.java new file mode 100644 index 000000000..b06b0af90 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/DownloadsReport.java @@ -0,0 +1,10 @@ + +package eu.dnetlib.doiboost.orcid.util; + +import java.util.LinkedHashMap; + +public class DownloadsReport extends LinkedHashMap { + + public DownloadsReport() { + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/MultiAttemptsHttpConnector.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/MultiAttemptsHttpConnector.java new file mode 100644 index 000000000..5ef6efa26 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/util/MultiAttemptsHttpConnector.java @@ -0,0 +1,272 @@ + +package eu.dnetlib.doiboost.orcid.util; + +import static eu.dnetlib.dhp.utils.DHPUtils.MAPPER; + +import java.io.IOException; +import java.io.InputStream; +import java.net.*; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.http.HttpHeaders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.common.collection.CollectorException; +import 
eu.dnetlib.dhp.common.collection.HttpClientParams; + +/** + * Derived from eu.dnetlib.dhp.common.collection.HttpConnector2 with custom report and Bearer auth + * + * @author enrico + */ +public class MultiAttemptsHttpConnector { + + private static final Logger log = LoggerFactory.getLogger(MultiAttemptsHttpConnector.class); + + private HttpClientParams clientParams; + + private String responseType = null; + + private static final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + + private String authToken = ""; + private String acceptHeaderValue = ""; + private String authMethod = ""; + public final static String BEARER = "BEARER"; + + public MultiAttemptsHttpConnector() { + this(new HttpClientParams()); + } + + public MultiAttemptsHttpConnector(HttpClientParams clientParams) { + this.clientParams = clientParams; + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } + + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl the URL + * @param report the list of errors + * @return the content of the downloaded resource + * @throws CollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl, DownloadsReport report) + throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, report); + } + + private String attemptDownloadAsString(final String requestUrl, final int retryNumber, + final DownloadsReport report) throws CollectorException { + + try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) { + return IOUtils.toString(s); + } catch (IOException e) { + log.error(e.getMessage(), e); + throw new CollectorException(e); + } + } + + private InputStream attemptDownload(final String requestUrl, final int retryNumber, + final DownloadsReport report) throws CollectorException, IOException { + + if (retryNumber > getClientParams().getMaxNumberOfRetry()) { + final String msg = String + .format( + 
"Max number of retries (%s/%s) exceeded, failing.", + retryNumber, getClientParams().getMaxNumberOfRetry()); + log.error(msg); + throw new CollectorException(msg); + } + + log.info("Request attempt {} [{}]", retryNumber, requestUrl); + + InputStream input = null; + + try { + if (getClientParams().getRequestDelay() > 0) { + backoffAndSleep(getClientParams().getRequestDelay()); + } + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000); + urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000); + urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent); + + if (!getAcceptHeaderValue().isEmpty()) { + urlConn.addRequestProperty(HttpHeaders.ACCEPT, getAcceptHeaderValue()); + } + if (!getAuthToken().isEmpty() && getAuthMethod().equals(BEARER)) { + urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken())); + } + + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } + + int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (is2xx(urlConn.getResponseCode())) { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + if (is3xx(urlConn.getResponseCode())) { + // REDIRECTS + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.info("The requested url has been moved to {}", newUrl); + report + .put( + urlConn.getResponseCode(), + String.format("Moved to: %s", newUrl)); + urlConn.disconnect(); + if (retryAfter > 0) { + backoffAndSleep(retryAfter); + } + return attemptDownload(newUrl, retryNumber + 1, report); + } + if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) { + switch (urlConn.getResponseCode()) { + case HttpURLConnection.HTTP_NOT_FOUND: + case HttpURLConnection.HTTP_BAD_GATEWAY: + case HttpURLConnection.HTTP_UNAVAILABLE: + case 
HttpURLConnection.HTTP_GATEWAY_TIMEOUT: + if (retryAfter > 0) { + log + .warn( + "{} - waiting and repeating request after suggested retry-after {} sec.", + requestUrl, retryAfter); + backoffAndSleep(retryAfter * 1000); + } else { + log + .warn( + "{} - waiting and repeating request after default delay of {} sec.", + requestUrl, getClientParams().getRetryDelay()); + backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000); + } + report.put(urlConn.getResponseCode(), requestUrl); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, report); + default: + report + .put( + urlConn.getResponseCode(), + String + .format( + "%s Error: %s", requestUrl, urlConn.getResponseMessage())); + throw new CollectorException(urlConn.getResponseCode() + " error " + report); + } + } + throw new CollectorException( + String + .format( + "Unexpected status code: %s errors: %s", urlConn.getResponseCode(), + MAPPER.writeValueAsString(report))); + } catch (MalformedURLException | UnknownHostException e) { + log.error(e.getMessage(), e); + report.put(-2, e.getMessage()); + throw new CollectorException(e.getMessage(), e); + } catch (SocketTimeoutException | SocketException e) { + log.error(e.getMessage(), e); + report.put(-3, e.getMessage()); + backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); + return attemptDownload(requestUrl, retryNumber + 1, report); + } + } + + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: {}", urlConn.getResponseMessage()); + + for (Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (String v : e.getValue()) { + log.debug(" key: {} - value: {}", e.getKey(), v); + } + } + } + } + + private void backoffAndSleep(int sleepTimeMs) throws CollectorException { + log.info("I'm going to sleep for {}ms", sleepTimeMs); + try { + Thread.sleep(sleepTimeMs); + } catch (InterruptedException e) { + log.error(e.getMessage(), 
e); + throw new CollectorException(e); + } + } + + private int obtainRetryAfter(final Map> headerMap) { + for (String key : headerMap.keySet()) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (!headerMap.get(key).isEmpty()) + && NumberUtils.isCreatable(headerMap.get(key).get(0))) { + return Integer.parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } + + private String obtainNewLocation(final Map> headerMap) throws CollectorException { + for (String key : headerMap.keySet()) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) { + return headerMap.get(key).get(0); + } + } + throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING"); + } + + private boolean is2xx(final int statusCode) { + return statusCode >= 200 && statusCode <= 299; + } + + private boolean is4xx(final int statusCode) { + return statusCode >= 400 && statusCode <= 499; + } + + private boolean is3xx(final int statusCode) { + return statusCode >= 300 && statusCode <= 399; + } + + private boolean is5xx(final int statusCode) { + return statusCode >= 500 && statusCode <= 599; + } + + public String getResponseType() { + return responseType; + } + + public HttpClientParams getClientParams() { + return clientParams; + } + + public void setClientParams(HttpClientParams clientParams) { + this.clientParams = clientParams; + } + + public void setAuthToken(String authToken) { + this.authToken = authToken; + } + + private String getAuthToken() { + return authToken; + } + + public String getAcceptHeaderValue() { + return acceptHeaderValue; + } + + public void setAcceptHeaderValue(String acceptHeaderValue) { + this.acceptHeaderValue = acceptHeaderValue; + } + + public String getAuthMethod() { + return authMethod; + } + + public void setAuthMethod(String authMethod) { + this.authMethod = authMethod; + } +} diff --git 
a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 9ea7c6959..d4079058e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -1,13 +1,11 @@ package eu.dnetlib.doiboost.orcid; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.*; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.text.ParseException; import java.text.SimpleDateFormat; @@ -17,7 +15,6 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.utils.Lists; -import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; @@ -28,8 +25,11 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.schema.orcid.AuthorData; -import eu.dnetlib.doiboost.orcid.xml.XMLRecordParserTest; +import eu.dnetlib.doiboost.orcid.util.DownloadsReport; +import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector; import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidClientTest { @@ -49,7 +49,7 @@ public class OrcidClientTest { @BeforeAll private static void setUp() throws 
IOException { - testPath = Files.createTempDirectory(XMLRecordParserTest.class.getName()); + testPath = Files.createTempDirectory(OrcidClientTest.class.getName()); System.out.println("using test path: " + testPath); } @@ -349,4 +349,84 @@ public class OrcidClientTest { final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); logToFile(testPath, "\n\nwork updated \n\n" + work); } + + @Test + void downloadUnknownHostExceptionTest() throws Exception { + logToFile(testPath, "downloadUnknownHostExceptionTest"); + final String orcid = "0000-0001-7291-3210"; + final HttpClientParams clientParams = new HttpClientParams(); + clientParams.setMaxNumberOfRetry(2); + MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams); + httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); + httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); + httpConnector.setAuthToken("78fdb232-7105-4086-8570-e153f4198e3d"); + String wrongApiUrl = "https://api.orcid_UNKNOWN.org/v3.0/" + orcid + "/" + REQUEST_TYPE_RECORD; + String url = "UNKNOWN"; + DownloadsReport report = new DownloadsReport(); + try { + httpConnector.getInputSource(wrongApiUrl, report); + } catch (CollectorException ce) { + logToFile(testPath, "CollectorException downloading: " + ce.getMessage()); + } catch (Throwable t) { + logToFile(testPath, "Throwable downloading: " + t.getMessage()); + } + } + + @Test + void downloadAttemptSuccessTest() throws Exception { + logToFile(testPath, "downloadAttemptSuccessTest"); + final String orcid = "0000-0001-7291-3210"; + final HttpClientParams clientParams = new HttpClientParams(); + clientParams.setMaxNumberOfRetry(2); + MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams); + httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); + httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); + httpConnector.setAuthToken("78fdb232-7105-4086-8570-e153f4198e3d"); + 
String apiUrl = "https://api.orcid.org/v3.0/" + orcid + "/" + REQUEST_TYPE_RECORD; + String url = "UNKNOWN"; + DownloadsReport report = new DownloadsReport(); + String record = httpConnector.getInputSource(apiUrl, report); + logToFile(testPath, "Downloaded at first attempt record: " + record); + } + + @Test + void downloadAttemptNotFoundTest() throws Exception { + logToFile(testPath, "downloadAttemptNotFoundTest"); + final HttpClientParams clientParams = new HttpClientParams(); + clientParams.setMaxNumberOfRetry(2); + MultiAttemptsHttpConnector httpConnector = new MultiAttemptsHttpConnector(clientParams); + httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); + httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); + httpConnector.setAuthToken("78fdb232-7105-4086-8570-e153f4198e3d"); + String apiUrl = "https://api.orcid.org/v3.0/NOTFOUND/" + REQUEST_TYPE_RECORD; + DownloadsReport report = new DownloadsReport(); + try { + httpConnector.getInputSource(apiUrl, report); + } catch (CollectorException ce) { + + } + report.forEach((k, v) -> { + try { + logToFile(testPath, k + " " + v); + } catch (IOException e) { + e.printStackTrace(); + } + }); + } + + @Test + @Ignore + void testDownloadedAuthor() throws Exception { + final String base64CompressedWork = 
"H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA="; + final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); + logToFile(testPath, "\n\ndownloaded author \n\n" + work); + } + + @Test + @Ignore + void testDownloadedWork() throws Exception { + final String base64CompressedWork = 
"H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMzuAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BSnn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2PmudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr8
96JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA=="; + final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); + logToFile(testPath, "\n\ndownloaded work \n\n" + work); + } } From 421d55265d7c567092bbf418cdce6b818a9dbd77 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 21 Sep 2021 03:07:58 +0300 Subject: [PATCH 16/18] created hive action for observatory queries --- .../{observatory.sh => observatory-post.sh} | 9 - .../graph/stats/oozie_app/observatory-pre.sh | 16 + .../scripts/step21-createObservatoryDB.sql | 449 +++++++++--------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 30 +- 4 files changed, 258 insertions(+), 246 deletions(-) rename dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/{observatory.sh => observatory-post.sh} (63%) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh similarity index 63% rename from 
dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index 7db8d40a5..db8d39af2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -9,16 +9,7 @@ fi export SOURCE=$1 export TARGET=$2 export SHADOW=$3 -export SCRIPT_PATH=$4 -echo "Getting file from " $4 -hdfs dfs -copyToLocal $4 - -echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - -cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | hive -f - impala-shell -q "invalidate metadata;" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - echo "Impala shell finished" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh new file mode 100644 index 000000000..92543b8b8 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -0,0 +1,16 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index f17b5358f..e0bdcd685 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,14 +1,14 @@ -create table TARGET.result_cc_licence stored as parquet as +create table ${observatory_db_name}.result_cc_licence stored as parquet as select r.id, coalesce(rln.count, 0) > 0 as cc_licence -from SOURCE.result r +from ${stats_db_name}.result r left outer join ( select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count - from SOURCE.result_licenses rl - left outer join SOURCE.licenses_normalized rln on rl.type=rln.license + from ${stats_db_name}.result_licenses rl + left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license group by rl.id ) rln on rln.id=r.id; -create table TARGET.result_affiliated_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -24,20 +24,20 @@ select rfc.count > 1 as multiple_funders, r.type, c.code as 
ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -create table TARGET.result_affiliated_year stored as parquet as +create table ${observatory_db_name}.result_affiliated_year stored as parquet as select count(distinct r.id) as total, r.green, @@ -53,20 +53,20 @@ select rfc.count > 1 as multiple_funders, r.type, r.year -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join 
SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -create table TARGET.result_affiliated_year_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -82,20 +82,20 @@ select rfc.count > 1 as multiple_funders, r.type, r.year, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from 
${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -create table TARGET.result_affiliated_datasource stored as parquet as +create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, r.green, @@ -111,22 +111,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_datasources rd on rd.id=r.id - left outer join SOURCE.datasource d on d.id=rd.datasource - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on 
o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id + left outer join ${stats_db_name}.datasource d on d.id=rd.datasource + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -create table TARGET.result_affiliated_datasource_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -142,22 +142,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_datasources rd on rd.id=r.id - left outer join SOURCE.datasource d on d.id=rd.datasource - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join 
${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id + left outer join ${stats_db_name}.datasource d on d.id=rd.datasource + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -create table TARGET.result_affiliated_organization stored as parquet as +create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, r.green, @@ -173,20 +173,20 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and 
c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -create table TARGET.result_affiliated_organization_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -202,20 +202,20 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on 
rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -create table TARGET.result_affiliated_funder stored as parquet as +create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, r.green, @@ -231,22 +231,22 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc 
on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; -create table TARGET.result_affiliated_funder_country stored as parquet as +create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -262,22 +262,22 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_organization ro on ro.id=r.id - join SOURCE.organization o on o.id=ro.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_organization ro on ro.id=r.id + join ${stats_db_name}.organization o on o.id=ro.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join 
${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; -create table TARGET.result_deposited_country stored as parquet as +create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -293,22 +293,22 @@ select rfc.count > 1 as multiple_funders, r.type, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids 
on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; -create table TARGET.result_deposited_year stored as parquet as +create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as total, r.green, @@ -324,22 +324,22 @@ select rfc.count > 1 as multiple_funders, r.type, r.year -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and 
c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; -create table TARGET.result_deposited_year_country stored as parquet as +create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -355,22 +355,22 @@ select rfc.count > 1 as multiple_funders, r.type, r.year, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join 
${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; -create table TARGET.result_deposited_datasource stored as parquet as +create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, r.green, @@ -386,22 +386,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id 
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; -create table TARGET.result_deposited_datasource_country stored as parquet as +create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -417,22 +417,22 @@ select rfc.count > 1 as multiple_funders, r.type, d.name as dname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join 
SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; -create table TARGET.result_deposited_organization stored as parquet as +create table ${observatory_db_name}.result_deposited_organization stored as parquet as select count(distinct r.id) as total, r.green, @@ -448,22 +448,22 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join 
SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; -create table TARGET.result_deposited_organization_country stored as parquet as +create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -479,22 +479,22 @@ select rfc.count > 1 as multiple_funders, r.type, o.name as oname, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') 
- join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; -create table TARGET.result_deposited_funder stored as parquet as +create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, r.green, @@ -510,24 +510,24 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder -from SOURCE.result r - join SOURCE.result_datasources rd 
on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, 
p.funder; -create table TARGET.result_deposited_funder_country stored as parquet as +create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, r.green, @@ -543,38 +543,19 @@ select rfc.count > 1 as multiple_funders, r.type, p.funder as pfunder, c.code as ccode, c.name as cname -from SOURCE.result r - join SOURCE.result_datasources rd on rd.id=r.id - join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') - join SOURCE.datasource_organizations dor on dor.id=d.id - join SOURCE.organization o on o.id=dor.organization - join SOURCE.country c on c.code=o.country and c.continent_name='Europe' - join SOURCE.result_projects rp on rp.id=r.id - join SOURCE.project p on p.id=rp.project - left outer join SOURCE.result_licenses rl on rl.id=r.id - left outer join SOURCE.result_pids pids on pids.id=r.id - left outer join SOURCE.result_cc_licence rln on rln.id=r.id - left outer join SOURCE.result_projectcount rpc on rpc.id=r.id - left outer join SOURCE.result_fundercount rfc on rfc.id=r.id +from ${stats_db_name}.result r + join ${stats_db_name}.result_datasources rd on rd.id=r.id + join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') + join ${stats_db_name}.datasource_organizations dor on dor.id=d.id + join ${stats_db_name}.organization o on o.id=dor.organization + join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' + join ${stats_db_name}.result_projects rp on rp.id=r.id + join ${stats_db_name}.project p on p.id=rp.project + left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id + left outer join ${stats_db_name}.result_pids pids on pids.id=r.id + left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${stats_db_name}.result_projectcount rpc on 
rpc.id=r.id + left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; - --- compute stats TARGET.result_affiliated_country; --- compute stats TARGET.result_affiliated_year; --- compute stats TARGET.result_affiliated_year_country; --- compute stats TARGET.result_affiliated_datasource; --- compute stats TARGET.result_affiliated_datasource_country; --- compute stats TARGET.result_affiliated_organization; --- compute stats TARGET.result_affiliated_organization_country; --- compute stats TARGET.result_affiliated_funder; --- compute stats TARGET.result_affiliated_funder_country; --- compute stats TARGET.result_deposited_country; --- compute stats TARGET.result_deposited_year; --- compute stats TARGET.result_deposited_year_country; --- compute stats TARGET.result_deposited_datasource; --- compute stats TARGET.result_deposited_datasource_country; --- compute stats TARGET.result_deposited_organization; --- compute stats TARGET.result_deposited_organization_country; --- compute stats TARGET.result_deposited_funder; --- compute stats TARGET.result_deposited_funder_country; \ No newline at end of file + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 8fe05a933..08d33f4e8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -326,20 +326,44 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh + + + + + + + ${jobTracker} + ${nameNode} + observatory-pre.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + observatory-pre.sh + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + observatory_db_name=${observatory_db_name} + + + + + + ${jobTracker} ${nameNode} - observatory.sh + observatory-post.sh ${stats_db_name} ${observatory_db_name} ${observatory_db_shadow_name} - ${wf:appPath()}/scripts/step21-createObservatoryDB.sql - observatory.sh + observatory-post.sh From f358cabb2bc63be8d31a069afbe476465a4c1038 Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 22 Sep 2021 21:50:37 +0300 Subject: [PATCH 17/18] fixed typo --- .../scripts/step21-createObservatoryDB.sql | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index e0bdcd685..e24370e7d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -30,7 +30,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join 
${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -59,7 +59,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -88,7 +88,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -119,7 +119,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.datasource d on d.id=rd.datasource left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join 
${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -150,7 +150,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.datasource d on d.id=rd.datasource left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -179,7 +179,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -208,7 +208,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer 
join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -239,7 +239,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -270,7 +270,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -301,7 +301,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids 
on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -332,7 +332,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -363,7 +363,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -394,7 +394,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join 
${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -425,7 +425,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -456,7 +456,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -487,7 +487,7 @@ from ${stats_db_name}.result r join 
${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe' left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -520,7 +520,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, @@ -553,7 +553,7 @@ from ${stats_db_name}.result r join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id left outer join ${stats_db_name}.result_pids pids on pids.id=r.id - left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id + left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, From 
b924276e18b8b5a438251b89998471c1b800eb0c Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Fri, 24 Sep 2021 17:11:56 +0200 Subject: [PATCH 18/18] tests to generate records for the EOSC-Future demo with the EOSC Jupyter Notebook subject --- .../provision/IndexRecordTransformerTest.java | 12 ++++ .../eosc-future/data-transfer-pilot.xml | 72 +++++++++++++++++++ .../training-notebooks-seadatanet.xml | 71 ++++++++++++++++++ .../eu/dnetlib/dhp/oa/provision/fields.xml | 16 +++-- 4 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index cd07cfcb1..8daf318be 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -76,6 +76,18 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } + @Test + public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException { + final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml")); + testRecordTransformation(record); + } + + @Test + public void testForEOSCFutureTraining() throws IOException, TransformerException { + final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/training-notebooks-seadatanet.xml")); + testRecordTransformation(record); + } + private void testRecordTransformation(final String record) throws IOException, 
TransformerException { final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml")); final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")); diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml new file mode 100644 index 000000000..23dd6c6ed --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml @@ -0,0 +1,72 @@ + + +
+ r37b0ad08687::dec0d8520e726f2adda9a51280ac7299 + 2021-09-22T08:53:16Z + under curation + +
+ + + + EGI-Foundation/data-transfer-pilot: Include libraries in environment.yml + Giuseppe La Rocca + Enol Fernández + Andrea Manzi + + + + This notebook is used to demonstrate how a scientist from one of the PaNOSC RIs can use the resources provided by EGI to perform analysis on the data sets obtained during an expirement. + + EOSC Jupyter Notebook + + Zenodo + + + + + + + + + + + + + + + + + + + oai:zenodo.org:4218562 + + oai:zenodo.org:4218562 + 10.5281/zenodo.4218562 + + + false + false + 0.9 + + + + + + + + + + + + + https://zenodo.org/record/4218562 + + + + + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml new file mode 100644 index 000000000..9995b902f --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml @@ -0,0 +1,71 @@ + + +
+ r37b0ad08687::eb430fb7438e1533ba95d6aa50a477eb + 2021-09-22T08:53:13Z + under curation + +
+ + + + + EGI-Foundation/training-notebooks-seadatanet: Version 0.4 + Enol Fernández + + + + A sample notebook using SeaDataNet data to plot a map that shows surface temperature of Black Sea, Arctic Sea and Baltic Sea. The data is available at EGI DataHub with PID http://hdl.handle.net/21.T15999/qVk6JWQ (run at EGI Notebooks service for easy access to data).This release updates the PID for the data. + + EOSC Jupyter Notebook + + Zenodo + + + + + + + + + + + + + + + + + + + oai:zenodo.org:3561323 + + oai:zenodo.org:3561323 + 10.5281/zenodo.3561323 + + + false + false + 0.9 + + + + + + + + + + + + + https://zenodo.org/record/3561323 + + + + + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index c47975c9d..910a366f6 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -15,7 +15,13 @@ - + + + + + + + @@ -28,7 +34,8 @@ - + + @@ -79,6 +86,7 @@ + @@ -105,7 +113,7 @@ - + @@ -130,7 +138,7 @@ - +