From 625d993cd97b81c05fef8e71b05519a81aa85c18 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 20 Apr 2021 02:31:06 +0300 Subject: [PATCH] added step for observatory db --- .../oa/graph/stats/oozie_app/observatory.sh | 28 ++ .../scripts/step21-createObservatoryDB.sql | 259 ++++++++++++++++++ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 29 +- 3 files changed, 313 insertions(+), 3 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh new file mode 100644 index 000000000..ff03bca03 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh @@ -0,0 +1,28 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow observatory database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql new file mode 100644 index 000000000..40cdf3f6d --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -0,0 +1,259 @@ +create table TARGET.result_affiliated_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; + +create table TARGET.result_affiliated_year stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; + +create table TARGET.result_affiliated_year_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; + +create table TARGET.result_affiliated_datasource stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_datasources rd on rd.id=r.id +left outer join SOURCE.datasource d on d.id=rd.datasource +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; + +create table TARGET.result_affiliated_datasource_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_datasources rd on rd.id=r.id +left outer join SOURCE.datasource d on d.id=rd.datasource +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; + +create table TARGET.result_affiliated_organization stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; + +create table TARGET.result_affiliated_organization_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; + +create table TARGET.result_affiliated_funder stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; + +create table TARGET.result_affiliated_funder_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_organization ro on ro.id=r.id +join SOURCE.organization o on o.id=ro.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; + +create table TARGET.result_deposited_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name; + +create table TARGET.result_deposited_year stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year; + +create table TARGET.result_deposited_year_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name; + +create table TARGET.result_deposited_datasource stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name; + +create table TARGET.result_deposited_datasource_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name; + +create table TARGET.result_deposited_organization stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name; + +create table TARGET.result_deposited_organization_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name; + +create table TARGET.result_deposited_funder stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder; + +create table TARGET.result_deposited_funder_country stored as parquet as +select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence, + case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, + r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname +from SOURCE.result r +join SOURCE.result_datasources rd on rd.id=r.id +join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository') +join SOURCE.datasource_organizations dor on dor.id=d.id +join SOURCE.organization o on o.id=dor.organization +join SOURCE.country c on c.code=o.country and c.continent_name='Europe' +join SOURCE.result_projects rp on rp.id=r.id +join SOURCE.project p on p.id=rp.project +left outer join SOURCE.result_licenses rl on rl.id=r.id +left outer join SOURCE.result_pids pids on pids.id=r.id +group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name; + +compute stats TARGET.result_affiliated_country; +compute stats TARGET.result_affiliated_year; +compute stats TARGET.result_affiliated_year_country; +compute stats TARGET.result_affiliated_datasource; +compute stats TARGET.result_affiliated_datasource_country; +compute stats TARGET.result_affiliated_organization; +compute stats TARGET.result_affiliated_organization_country; +compute stats TARGET.result_affiliated_funder; +compute stats TARGET.result_affiliated_funder_country; +compute stats TARGET.result_deposited_country; +compute stats TARGET.result_deposited_year; +compute stats TARGET.result_deposited_year_country; +compute stats TARGET.result_deposited_datasource; +compute stats TARGET.result_deposited_datasource_country; +compute stats TARGET.result_deposited_organization; +compute stats TARGET.result_deposited_organization_country; +compute stats TARGET.result_deposited_funder; +compute stats TARGET.result_deposited_funder_country; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 321500e2c..824a8b3c7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -25,6 +25,14 @@ monitor_db_shadow_name the name of the shadow monitor db + + observatory_db_name + the target monitor db name + + + observatory_db_shadow_name + the name of the shadow monitor db + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache update. @@ -305,11 +313,26 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh - + - + + + ${jobTracker} + ${nameNode} + observatory.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + ${wf:appPath()}/scripts/step21-createObservatoryDB.sql + observatory.sh + + + + + + ${jobTracker} ${nameNode} @@ -322,4 +345,4 @@ - + \ No newline at end of file