From 2ebb1459a9c01eeda8d81865c75585e361cc5426 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 28 Sep 2022 14:36:57 +0300 Subject: [PATCH 1/8] Fixed type in no_downloads --- .../scripts/step16-createIndicatorsTables.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 417ed6e4e..1bda07629 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -454,16 +454,16 @@ FROM publication_datasources pd compute stats indi_pub_hybrid_oa_with_cc; create table indi_pub_downloads stored as parquet as -SELECT result_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats +SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id -order by no_dowloads desc; +order by no_downloads desc; compute stats indi_pub_downloads; create table indi_pub_downloads_datasource stored as parquet as -SELECT result_id, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats +SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id @@ -472,7 +472,7 @@ order by result_id; compute stats indi_pub_downloads_datasource; create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join publication on result_id=id where downloads>0 GROUP BY result_id, `year` order by `year` asc; @@ -480,7 +480,7 @@ order by `year` asc; compute stats indi_pub_downloads_year; create table indi_pub_downloads_datasource_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us +SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id, `year` From bdc46e3eaab57a926977d162131d78933566c08b Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 28 Sep 2022 14:59:08 +0300 Subject: [PATCH 2/8] Remove denormalization of results to fix downloads numbers in monitor --- .../oozie_app/scripts/step20-createMonitorDB.sql | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 290acbf9f..2505c3a34 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -39,7 +39,6 @@ create table TARGET.result stored as parquet as 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) - 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly @@ -224,18 +223,3 @@ create table TARGET.indi_result_with_pid stored as parquet as select * from SOUR --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_software_gold_oa; ---denorm -alter table TARGET.result rename to TARGET.res_tmp; - -create table TARGET.result_denorm stored as parquet as - select distinct r.*, rp.project, p.acronym as pacronym, p.title as ptitle, p.funder as pfunder, p.funding_lvl0 as pfunding_lvl0, rd.datasource, d.name as dname, d.type as dtype - from TARGET.res_tmp r - left outer join TARGET.result_projects rp on rp.id=r.id - left outer join TARGET.result_datasources rd on rd.id=r.id - left outer join TARGET.project p on p.id=rp.project - left outer join TARGET.datasource d on d.id=rd.datasource; -compute stats TARGET.result_denorm; - -alter table TARGET.result_denorm rename to TARGET.result; -drop table TARGET.res_tmp; ---- done! \ No newline at end of file From 2c0c3f18064da57d31f88639b41f94348a404b6e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 28 Sep 2022 19:33:24 +0300 Subject: [PATCH 3/8] Cast amount to float for table result_apcs --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 04c7f83b9..86ead4a2c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -42,7 +42,7 @@ join ${stats_db_name}.result res on res.id=r.id; create table ${stats_db_name}.result_apc as select r.id, r.amount, r.currency from ( - select substr(r.id, 4) as id, inst.processingchargeamount.value as amount, inst.processingchargecurrency.value as currency + select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; From 5df9c6396336e895708d00642ebb229825dc7665 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 27 Oct 2022 16:44:26 +0300 Subject: [PATCH 4/8] Added fields: totalcost, fundedamount, currency, in project table --- .../dhp/oa/graph/stats/oozie_app/scripts/step11.sql | 4 +++- .../dhp/oa/graph/stats/oozie_app/scripts/step13.sql | 8 ++++---- .../dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 8 ++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index d699b68c3..41c3ed751 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -42,7 +42,9 @@ SELECT p.id, CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, p.callidentifier, p.code, - p.totalcost + p.totalcost, + p.fundedamount, + p.currency FROM ${stats_db_name}.project_tmp p LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np FROM ${stats_db_name}.project_results pr diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index aee66fd5e..24e1a1355 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -59,7 +59,7 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -create table ${stats_db_name}.result_orcid STORED AS PARQUET as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid @@ -69,7 +69,7 @@ from ( LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; -create table ${stats_db_name}.result_result stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id @@ -82,7 +82,7 @@ where reltype='resultResult' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; -create table ${stats_db_name}.result_citations_oc stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as select substr(target, 4) as id, count(distinct substr(source, 4)) as citations from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id @@ -97,7 +97,7 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE group by substr(target, 4); -create table ${stats_db_name}.result_references_oc stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as select substr(source, 4) as id, count(distinct substr(target, 4)) as references from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 5461afde6..c31180c14 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -48,7 +48,9 @@ CREATE TABLE ${stats_db_name}.project_tmp delayedpubs INT, callidentifier STRING, code STRING, - totalcost FLOAT + totalcost FLOAT, + fundedamount FLOAT, + currency STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); INSERT INTO ${stats_db_name}.project_tmp @@ -72,7 +74,9 @@ SELECT substr(p.id, 4) AS id, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code, - p.totalcost AS totalcost + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + p.currency.value AS currency FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; From 7861c472e0c8e4c3084a04721dafc736111c4963 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 28 Oct 2022 19:00:32 +0300 Subject: [PATCH 5/8] Hive memory parameters --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml index 9331d4ac5..63fc84d75 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml @@ -21,7 +21,7 @@ hive_jdbc_url - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 oozie.wf.workflow.notification.url From 7fda05e3803bebd087c382411d88914e21c6f2db Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 1 Nov 2022 13:59:40 +0200 Subject: [PATCH 6/8] Added Autonomous University of Barcelona --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 2505c3a34..5dbedfa67 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -48,7 +48,8 @@ create table TARGET.result stored as parquet as 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech - 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a' -- Nanyang Technological University + 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University + 'openorgs____::4b34103bde246228fcd837f5f1bf4212' -- Autonomous University of Barcelona ) )) foo; compute stats TARGET.result; From 992fc5b628ccad566acf218019cf576408564f73 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 3 Nov 2022 11:02:18 +0200 Subject: [PATCH 7/8] Added McMaster University Institution --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 5dbedfa67..2bdcbfa3d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -49,7 +49,8 @@ create table TARGET.result stored as parquet as 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University - 'openorgs____::4b34103bde246228fcd837f5f1bf4212' -- Autonomous University of Barcelona + 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona + 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb' -- McMaster University ) )) foo; compute stats TARGET.result; From 2a52a42169ecdede8024fc9fcbadb1008e49f912 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 6 Dec 2022 10:10:21 +0200 Subject: [PATCH 8/8] Added 4 institutions: -University of Modena and Reggio Emilia -Bilkent University -Saints Cyril and Methodius University of Skopje -University of Milan --- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 2bdcbfa3d..98dca7129 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -50,7 +50,11 @@ create table TARGET.result stored as parquet as 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona - 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb' -- McMaster University + 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University + 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia + 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University + 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje + 'openorgs____::db7686f30f22cbe73a4fde872ce812a6' -- University of Milan ) )) foo; compute stats TARGET.result;