From 83722ebc47764a2ac23a55102cdc7c2920823c86 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 28 May 2021 14:11:46 +0200 Subject: [PATCH] pull #111 replied on stable_ids --- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 21 ++ .../oa/graph/stats/oozie_app/observatory.sh | 28 ++ .../graph/stats/oozie_app/scripts/step12.sql | 32 --- .../graph/stats/oozie_app/scripts/step16.sql | 31 +-- .../stats/oozie_app/scripts/step16_5.sql | 5 +- .../scripts/step20-createMonitorDB.sql | 6 - .../scripts/step21-createObservatoryDB.sql | 259 ++++++++++++++++++ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 29 +- 8 files changed, 336 insertions(+), 75 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 0d8ff7ee32..8286e50391 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -16,6 +16,14 @@ monitor_db_production_name the name of the monitor public database + + observatory_db_name + the monitor database name + + + observatory_db_production_name + the name of the monitor public database + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache promote. @@ -77,6 +85,19 @@ ${monitor_db_production_name} updateProductionViews.sh + + + + + + + ${jobTracker} + ${nameNode} + updateProductionViews.sh + ${observatory_db_name} + ${observatory_db_production_name} + updateProductionViews.sh + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh new file mode 100644 index 0000000000..ff03bca038 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh @@ -0,0 +1,28 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow observatory database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 51d3a73c9e..47d147f756 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -45,35 +45,3 @@ FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct; - - -------------------------------------------------------------------------------- --- To see with Antonis if the following is needed and where it should be placed -------------------------------------------------------------------------------- -CREATE TABLE ${stats_db_name}.numbers_country AS -SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications -FROM ${stats_db_name}.result r, - ${stats_db_name}.result_datasources rd, - ${stats_db_name}.datasource d, - ${stats_db_name}.datasource_organizations dor, - ${stats_db_name}.organization org -WHERE r.id = rd.id - AND rd.datasource = d.id - AND d.id = dor.id - AND dor.organization = org.id - AND r.type = 'publication' - AND r.bestlicence = 'Open Access' -GROUP BY org.country; - --- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql index 833deff734..481fd9e8c2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql @@ -59,33 +59,4 @@ from result_gold union all select distinct r.id, false as gold from ${stats_db_name}.result r -where r.id not in (select id from result_gold); - --- shortcut result-country through the organization affiliation -create table ${stats_db_name}.result_affiliated_country as -select r.id as id, o.country as country -from ${stats_db_name}.result r -join ${stats_db_name}.result_organization ro on ro.id=r.id -join ${stats_db_name}.organization o on o.id=ro.organization -where o.country is not null and o.country!=''; - --- shortcut result-country through datasource of deposition -create table ${stats_db_name}.result_deposited_country as -select r.id as id, o.country as country -from ${stats_db_name}.result r -join ${stats_db_name}.result_datasources rd on rd.id=r.id -join ${stats_db_name}.datasource d on d.id=rd.datasource -join ${stats_db_name}.datasource_organizations dor on dor.id=d.id -join ${stats_db_name}.organization o on o.id=dor.organization -where o.country is not null and o.country!=''; - --- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where r.id not in (select id from result_gold); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index 2bdc263ef5..f737c1ea61 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -52,7 +52,4 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; drop table if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; --- --- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +drop table ${stats_db_name}.result_tmp; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 9477ada12f..af5e2a6a4d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -19,9 +19,6 @@ create table TARGET.result as select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo; compute stats TARGET.result; -create table TARGET.result_affiliated_country as select * from SOURCE.result_affiliated_country rac where exists (select 1 from TARGET.result r where r.id=rac.id); -compute stats TARGET.result_affiliated_country; - create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_citations; @@ -34,9 +31,6 @@ compute stats TARGET.result_concepts; create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_datasources; -create table TARGET.result_deposited_country as select * from SOURCE.result_deposited_country orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_deposited_country; - create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_fundercount; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql new file mode 100644 index 0000000000..40cdf3f6d9 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -0,0 +1,259 @@ +create table TARGET.result_affiliated_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; + +create table TARGET.result_affiliated_year stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; + +create table TARGET.result_affiliated_year_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; + +create table TARGET.result_affiliated_datasource stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_datasources rd on rd.id=r.id +left outer join SOURCE.datasource d on d.id=rd.datasource +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; + +create table TARGET.result_affiliated_datasource_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_datasources rd on rd.id=r.id +left outer join SOURCE.datasource d on d.id=rd.datasource +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; + +create table TARGET.result_affiliated_organization stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; + +create table TARGET.result_affiliated_organization_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; + +create table TARGET.result_affiliated_funder stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; + +create table TARGET.result_affiliated_funder_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; + +create table TARGET.result_deposited_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; + +create table TARGET.result_deposited_year stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; + +create table TARGET.result_deposited_year_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; + +create table TARGET.result_deposited_datasource stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; + +create table TARGET.result_deposited_datasource_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; + +create table TARGET.result_deposited_organization stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; + +create table TARGET.result_deposited_organization_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; + +create table TARGET.result_deposited_funder stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; + +create table TARGET.result_deposited_funder_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; + +compute stats TARGET.result_affiliated_country; +compute stats TARGET.result_affiliated_year; +compute stats TARGET.result_affiliated_year_country; +compute stats TARGET.result_affiliated_datasource; +compute stats TARGET.result_affiliated_datasource_country; +compute stats TARGET.result_affiliated_organization; +compute stats TARGET.result_affiliated_organization_country; +compute stats TARGET.result_affiliated_funder; +compute stats TARGET.result_affiliated_funder_country; +compute stats TARGET.result_deposited_country; +compute stats TARGET.result_deposited_year; +compute stats TARGET.result_deposited_year_country; +compute stats TARGET.result_deposited_datasource; +compute stats TARGET.result_deposited_datasource_country; +compute stats TARGET.result_deposited_organization; +compute stats TARGET.result_deposited_organization_country; +compute stats TARGET.result_deposited_funder; +compute stats TARGET.result_deposited_funder_country; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 321500e2c2..824a8b3c7d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -25,6 +25,14 @@ monitor_db_shadow_name the name of the shadow monitor db + + observatory_db_name + the target monitor db name + + + observatory_db_shadow_name + the name of the shadow monitor db + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache update. @@ -305,11 +313,26 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh - + - + + + ${jobTracker} + ${nameNode} + observatory.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + ${wf:appPath()}/scripts/step21-createObservatoryDB.sql + observatory.sh + + + + + + ${jobTracker} ${nameNode} @@ -322,4 +345,4 @@ - + \ No newline at end of file