From 3bf3127251477ea0a1ea5a3d422c55403aaf28b5 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 14 Sep 2022 16:36:19 +0300 Subject: [PATCH 1/2] Changes to monitor and indicator scripts --- .../oozie_app/scripts/step16-createIndicatorsTables.sql | 6 +++--- .../stats/oozie_app/scripts/step16_1-definitions.sql | 6 +++--- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 0f294145c..b8792ae61 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -76,11 +76,11 @@ compute stats indi_result_with_orcid; ---- Sprint 3 ---- create table indi_funded_result_with_fundref stored as parquet as -select distinct r.id, coalesce(fundref, 0) as fundref +select distinct r.result as id, coalesce(fundref, 0) as fundref from project_results r - left outer join (select distinct id, 1 as fundref from project_results + left outer join (select distinct result, 1 as fundref from project_results where provenance='Harvested') tmp - on r.id= tmp.id; + on r.result= tmp.result; compute stats indi_funded_result_with_fundref; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 88c1ece78..41c95758c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,20 +3,20 @@ ---------------------------------------------------- -- Peer reviewed: -create table ${stats_db_name}.result_peerreviewed STORED AS PARQUET as +create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; -- Green OA: -create table ${stats_db_name}.result_greenoa STORED AS PARQUET as +create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: -create table ${stats_db_name}.result_gold STORED AS PARQUET as +create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as select r.id, case when gold.is_gold=1 then true else false end as gold from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 572b70ef7..5af2e9edd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -162,10 +162,10 @@ create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * fro compute stats TARGET.indi_pub_doi_from_crossref; create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_gold_oa; -create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_datasets_gold_oa; -create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_software_gold_oa; +--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_datasets_gold_oa; +--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_software_gold_oa; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_has_abstract; create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); From dcd85f8cd79ab8342e03eca6ef9530f35dd242ab Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 22 Sep 2022 13:33:07 +0300 Subject: [PATCH 2/2] - Synchronize indicators in stats-db with monitor-db - added new openorg id for Nanyang Technological University - changed openorg id for University of Helsinki #8088 ticket --- .../scripts/step16-createIndicatorsTables.sql | 35 +++++++++++ .../scripts/step20-createMonitorDB.sql | 61 ++++++++++++------- 2 files changed, 74 insertions(+), 22 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index b8792ae61..417ed6e4e 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -92,6 +92,17 @@ compute stats indi_funded_result_with_fundref; -- -- compute stats indi_result_org_collab; -- +create table indi_result_org_collab stored as parquet as +with tmp as ( +select distinct ro.organization organization, ro.id from result_organization ro +join organization o on o.id=ro.organization where o.name is not null) +select o1.organization org1, o2.organization org2, count(o1.id) as collaborations +from tmp as o1 +join tmp as o2 on o1.id=o2.id and o1.organization!=o2.organization +group by org1, org2; + +compute stats indi_result_org_collab; + -- create table indi_result_org_country_collab stored as parquet as -- with tmp as -- (select o.id as id, o.country , ro.id as result,r.type from organization o @@ -105,6 +116,17 @@ compute stats indi_funded_result_with_fundref; -- -- compute stats indi_result_org_country_collab; -- +create table indi_result_org_country_collab stored as parquet as +with tmp as +(select distinct ro.organization organization, ro.id, o.country from result_organization ro +join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) +select o1.organization org1,o2.country country2, count(o1.id) as collaborations +from tmp as o1 join tmp as o2 on o1.id=o2.id +where o1.id=o2.id and o1.country!=o2.country +group by o1.organization, o1.id, o2.country; + +compute stats indi_result_org_country_collab; + -- create table indi_result_org_collab stored as parquet as -- with tmp as -- (select o.id, ro.id as result,r.type from organization o @@ -166,6 +188,19 @@ compute stats indi_funder_country_collab; -- -- compute stats indi_result_country_collab; +create table indi_result_country_collab stored as parquet as +with tmp as + (select distinct country, ro.id as result from organization o + join result_organization ro on o.id=ro.organization + where country <> 'UNKNOWN' and o.name is not null) +select o1.country country1, o2.country country2, count(o1.result) as collaborations +from tmp as o1 + join tmp as o2 on o1.result=o2.result +where o1.country<>o2.country +group by o1.country, o2.country; + +compute stats indi_result_country_collab; + ---- Sprint 4 ---- create table indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 5af2e9edd..290acbf9f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -23,7 +23,7 @@ create table TARGET.result stored as parquet as 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade - 'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki + 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen @@ -48,7 +48,8 @@ create table TARGET.result stored as parquet as 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank - 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3' -- École des Ponts ParisTech + 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech + 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a' -- Nanyang Technological University ) )) foo; compute stats TARGET.result; @@ -154,50 +155,44 @@ create table TARGET.project_results stored as parquet as select id as result, pr compute stats TARGET.project_results; -- indicators +-- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_green_oa; create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_grey_lit; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_doi_from_crossref; -create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; ---create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_datasets_gold_oa; ---create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_software_gold_oa; -create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_has_abstract; +-- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_result_has_cc_licence; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_result_has_cc_licence_url; - -create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab; -create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; -create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country; - +create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_has_abstract; create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_result_with_orcid; +---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_funded_result_with_fundref; +create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; +create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; +create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; +create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country; +create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab; +create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; +---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_diamond; -create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid; create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_in_transformative; create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_closed_other_open; - +---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_result_no_of_copies; - -create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; -create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; +---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_hybrid_oa_with_cc; - create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); compute stats TARGET.indi_pub_downloads; create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); @@ -206,6 +201,28 @@ create table TARGET.indi_pub_downloads_year stored as parquet as select * from S compute stats TARGET.indi_pub_downloads_year; create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); compute stats TARGET.indi_pub_downloads_datasource_year; +---- Sprint 7 ---- +create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_gold_oa; +create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.indi_pub_hybrid; +create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; +create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; +create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; +create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub; +create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year; +create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year; +create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; +create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; +create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; +create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_datasets_gold_oa; +--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_software_gold_oa; --denorm alter table TARGET.result rename to TARGET.res_tmp;