From 25d0512fbdfdafa79b531cab7460d6326b22609e Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 20 Apr 2021 01:43:23 +0300 Subject: [PATCH 1/5] code cleanup --- .../graph/stats/oozie_app/scripts/step16.sql | 31 +------------------ .../stats/oozie_app/scripts/step16_5.sql | 5 +-- .../scripts/step20-createMonitorDB.sql | 6 ---- 3 files changed, 2 insertions(+), 40 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql index 833deff73..481fd9e8c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql @@ -59,33 +59,4 @@ from result_gold union all select distinct r.id, false as gold from ${stats_db_name}.result r -where r.id not in (select id from result_gold); - --- shortcut result-country through the organization affiliation -create table ${stats_db_name}.result_affiliated_country as -select r.id as id, o.country as country -from ${stats_db_name}.result r -join ${stats_db_name}.result_organization ro on ro.id=r.id -join ${stats_db_name}.organization o on o.id=ro.organization -where o.country is not null and o.country!=''; - --- shortcut result-country through datasource of deposition -create table ${stats_db_name}.result_deposited_country as -select r.id as id, o.country as country -from ${stats_db_name}.result r -join ${stats_db_name}.result_datasources rd on rd.id=r.id -join ${stats_db_name}.datasource d on d.id=rd.datasource -join ${stats_db_name}.datasource_organizations dor on dor.id=d.id -join ${stats_db_name}.organization o on o.id=dor.organization -where o.country is not null and o.country!=''; - --- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS; --- ANALYZE TABLE 
${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where r.id not in (select id from result_gold); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index 2bdc263ef..f737c1ea6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -52,7 +52,4 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; drop table if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; --- --- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +drop table ${stats_db_name}.result_tmp; \ No newline at end of file diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 9477ada12..af5e2a6a4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -19,9 +19,6 @@ create table TARGET.result as select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo; compute stats TARGET.result; -create table TARGET.result_affiliated_country as select * from SOURCE.result_affiliated_country rac where exists (select 1 from TARGET.result r where r.id=rac.id); -compute stats TARGET.result_affiliated_country; - create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_citations; @@ -34,9 +31,6 @@ compute stats TARGET.result_concepts; create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_datasources; -create table TARGET.result_deposited_country as select * from SOURCE.result_deposited_country orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_deposited_country; - create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_fundercount; From 625d993cd97b81c05fef8e71b05519a81aa85c18 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 20 Apr 2021 02:31:06 +0300 Subject: [PATCH 2/5] added step for observatory db --- 
.../oa/graph/stats/oozie_app/observatory.sh | 28 ++ .../scripts/step21-createObservatoryDB.sql | 259 ++++++++++++++++++ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 29 +- 3 files changed, 313 insertions(+), 3 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh new file mode 100644 index 000000000..ff03bca03 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh @@ -0,0 +1,28 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow observatory database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" 
\ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql new file mode 100644 index 000000000..40cdf3f6d --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -0,0 +1,259 @@ +create table TARGET.result_affiliated_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; + +create table TARGET.result_affiliated_year stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on 
pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; + +create table TARGET.result_affiliated_year_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; + +create table TARGET.result_affiliated_datasource stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_datasources rd on rd.id=r.id +left outer join SOURCE.datasource d on d.id=rd.datasource +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; + +create table TARGET.result_affiliated_datasource_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true 
else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_datasources rd on rd.id=r.id +left outer join SOURCE.datasource d on d.id=rd.datasource +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; + +create table TARGET.result_affiliated_organization stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; + +create table TARGET.result_affiliated_organization_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name 
as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; + +create table TARGET.result_affiliated_funder stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; + +create table TARGET.result_affiliated_funder_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on 
p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; + +create table TARGET.result_deposited_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; + +create table TARGET.result_deposited_year stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and 
c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; + +create table TARGET.result_deposited_year_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; + +create table TARGET.result_deposited_datasource stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on 
c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; + +create table TARGET.result_deposited_datasource_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; + +create table TARGET.result_deposited_organization stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on 
o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; + +create table TARGET.result_deposited_organization_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; + +create table TARGET.result_deposited_funder stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on 
dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; + +create table TARGET.result_deposited_funder_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; + +compute stats TARGET.result_affiliated_country; +compute stats TARGET.result_affiliated_year; +compute stats TARGET.result_affiliated_year_country; +compute stats TARGET.result_affiliated_datasource; +compute stats TARGET.result_affiliated_datasource_country; +compute stats TARGET.result_affiliated_organization; +compute stats TARGET.result_affiliated_organization_country; +compute stats TARGET.result_affiliated_funder; +compute stats 
TARGET.result_affiliated_funder_country; +compute stats TARGET.result_deposited_country; +compute stats TARGET.result_deposited_year; +compute stats TARGET.result_deposited_year_country; +compute stats TARGET.result_deposited_datasource; +compute stats TARGET.result_deposited_datasource_country; +compute stats TARGET.result_deposited_organization; +compute stats TARGET.result_deposited_organization_country; +compute stats TARGET.result_deposited_funder; +compute stats TARGET.result_deposited_funder_country; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 321500e2c..824a8b3c7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -25,6 +25,14 @@ monitor_db_shadow_name the name of the shadow monitor db + + observatory_db_name + the target observatory db name + + + observatory_db_shadow_name + the name of the shadow observatory db + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache update. 
@@ -305,11 +313,26 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh - + - + + + ${jobTracker} + ${nameNode} + observatory.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + ${wf:appPath()}/scripts/step21-createObservatoryDB.sql + observatory.sh + + + + + + ${jobTracker} ${nameNode} @@ -322,4 +345,4 @@ - + \ No newline at end of file From 168edcbde32da360c92785852311ed651bb305c9 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 18 May 2021 15:23:20 +0300 Subject: [PATCH 3/5] added the final steps for the observatory promote wf and some cleanup --- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 21 ++++++++++++ .../graph/stats/oozie_app/scripts/step12.sql | 32 ------------------- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 0d8ff7ee3..8286e5039 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -16,6 +16,14 @@ monitor_db_production_name the name of the monitor public database + + observatory_db_name + the observatory database name + + + observatory_db_production_name + the name of the observatory public database + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache promote. 
@@ -77,6 +85,19 @@ ${monitor_db_production_name} updateProductionViews.sh + + + + + + + ${jobTracker} + ${nameNode} + updateProductionViews.sh + ${observatory_db_name} + ${observatory_db_production_name} + updateProductionViews.sh + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 51d3a73c9..47d147f75 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -45,35 +45,3 @@ FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct; - - -------------------------------------------------------------------------------- --- To see with Antonis if the following is needed and where it should be placed -------------------------------------------------------------------------------- -CREATE TABLE ${stats_db_name}.numbers_country AS -SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications -FROM ${stats_db_name}.result r, - ${stats_db_name}.result_datasources rd, - ${stats_db_name}.datasource d, - ${stats_db_name}.datasource_organizations dor, - ${stats_db_name}.organization org -WHERE r.id = rd.id - AND rd.datasource = d.id - AND d.id = dor.id - AND dor.organization = org.id - AND r.type = 'publication' - AND r.bestlicence = 'Open Access' -GROUP BY org.country; - --- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset 
COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file From d413b24611765245315203f862566aa24e07d973 Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 10 Jun 2021 02:35:46 +0300 Subject: [PATCH 4/5] added instances, orgs for monitor, totalcost for projects, apcs --- .../graph/stats/oozie_app/scripts/step11.sql | 3 ++- .../stats/oozie_app/scripts/step16_6.sql | 25 +++++++++++++------ .../scripts/step20-createMonitorDB.sql | 8 +++++- .../graph/stats/oozie_app/scripts/step6.sql | 19 ++++---------- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index d26169fd6..b977302df 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -41,7 +41,8 @@ SELECT p.id, CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, p.callidentifier, - p.code + p.code, + p.totalcost FROM ${stats_db_name}.project_tmp p LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np FROM ${stats_db_name}.project_results pr diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql index 528aaff52..5280cf3e3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql @@ -30,10 +30,21 @@ from rcount group by rcount.pid; create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; --- --- ANALYZE TABLE ${stats_db_name}.result_projectcount COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_projectcount COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_fundercount COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_fundercount COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.project_resultcount COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project_resultcount COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + +create table ${stats_db_name}.result_instance as +select distinct r.* +from ( + select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom, + substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid + from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r +join ${stats_db_name}.result res on res.id=r.id; + +create table ${stats_db_name}.result_apc as +select r.id, r.amount, r.currency +from ( + select substr(r.id, 4) as id, inst.processingchargeamount.value as amount, inst.processingchargecurrency.value as currency + from ${openaire_db_name}.result r lateral view explode(r.instance) 
instances as inst) r +join ${stats_db_name}.result res on res.id=r.id +where r.amount is not null; + +create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index af5e2a6a4..74aa8536c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -16,7 +16,13 @@ create table TARGET.result as select distinct * from ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) union all - select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo; + select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) + union all + /* results linked to projects in which one of the organizations named below participates */ select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on p.id=rp.project join SOURCE.project_organizations po on po.id=p.id join SOURCE.organization o on o.id=po.organization where rp.id=r.id and o.name in ( + 'GEORG-AUGUST-UNIVERSITAT GOTTINGEN STIFTUNG OFFENTLICHEN RECHTS', + 'ATHINA-EREVNITIKO KENTRO KAINOTOMIAS STIS TECHNOLOGIES TIS PLIROFORIAS, TON EPIKOINONION KAI TIS GNOSIS', + 'Consiglio Nazionale delle Ricerche', + 'Universidade do Minho') )) foo; compute stats TARGET.result; create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); diff --git
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 23ef03bc9..5d81e97bb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -39,7 +39,8 @@ CREATE TABLE ${stats_db_name}.project_tmp daysforlastpub INT, delayedpubs INT, callidentifier STRING, - code STRING + code STRING, + totalcost FLOAT ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); INSERT INTO ${stats_db_name}.project_tmp @@ -62,7 +63,8 @@ SELECT substr(p.id, 4) AS id, 0 AS daysforlastpub, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, - p.code.value AS code + p.code.value AS code, + p.totalcost AS totalcost FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false; @@ -70,15 +72,4 @@ create table ${stats_db_name}.funder as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; - --- ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.project_organizations COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project_organizations COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.project_results COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project_results COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.project_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.project_tmp COMPUTE STATISTICS FOR COLUMNS; --- 
ANALYZE TABLE ${stats_db_name}.funder COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.funder COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; \ No newline at end of file From f7c0b80e35d853b4594abaa5f4fcade48e7d4e21 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 15 Jun 2021 14:45:48 +0300 Subject: [PATCH 5/5] storing result_instance as parquet --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql index 5280cf3e3..3a7d9f455 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql @@ -31,7 +31,7 @@ group by rcount.pid; create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; -create table ${stats_db_name}.result_instance as +create table ${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( select substr(r.id, 4) as id, inst.accessright.classname as accessright, substr(inst.collectedfrom.key, 4) as collectedfrom,