diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh
similarity index 63%
rename from dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh
rename to dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh
index 7db8d40a5..db8d39af2 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh
@@ -9,16 +9,7 @@ fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
-export SCRIPT_PATH=$4
-echo "Getting file from " $4
-hdfs dfs -copyToLocal $4
-
-echo "Creating observatory database"
-impala-shell -q "drop database if exists ${TARGET} cascade"
-impala-shell -q "create database if not exists ${TARGET}"
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
-cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | hive -f -
impala-shell -q "invalidate metadata;"
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f -
echo "Impala shell finished"
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh
new file mode 100644
index 000000000..92543b8b8
--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh
@@ -0,0 +1,30 @@
+# Creates the observatory database: drops and recreates TARGET, then creates
+# in it one view per table listed in SOURCE.
+#
+# Usage: observatory-pre.sh SOURCE TARGET [SHADOW]
+#   SOURCE  database whose tables are exposed as views
+#   TARGET  database that is dropped and recreated
+#   SHADOW  exported for child processes; not used by this script
+
+# Replace the /tmp egg-cache path with a symlink into PYTHON_EGG_CACHE unless
+# it is already a symlink (presumably so impala-shell unpacks its eggs under
+# the user's home rather than /tmp -- confirm).
+export PYTHON_EGG_CACHE="/home/$(whoami)/.python-eggs"
+export link_folder="/tmp/impala-shell-python-egg-cache-$(whoami)"
+if ! [ -L "${link_folder}" ]
+then
+  rm -Rf "${link_folder}"
+  ln -sfn "${PYTHON_EGG_CACHE}${link_folder}" "${link_folder}"
+fi
+
+# TARGET is dropped with "cascade" below, so refuse to run without both names.
+export SOURCE="${1:?usage: observatory-pre.sh SOURCE TARGET [SHADOW]}"
+export TARGET="${2:?usage: observatory-pre.sh SOURCE TARGET [SHADOW]}"
+export SHADOW="$3"
+
+echo "Creating observatory database"
+impala-shell -q "drop database if exists ${TARGET} cascade"
+impala-shell -q "create database if not exists ${TARGET}"
+# Turn every table name listed for SOURCE into a "create view" statement and
+# feed the generated script to impala-shell on stdin.
+impala-shell -d "${SOURCE}" -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
index f17b5358f..e0bdcd685 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
@@ -1,14 +1,14 @@
-create table TARGET.result_cc_licence stored as parquet as
+create table ${observatory_db_name}.result_cc_licence stored as parquet as
select r.id, coalesce(rln.count, 0) > 0 as cc_licence
-from SOURCE.result r
+from ${stats_db_name}.result r
left outer join (
select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
- from SOURCE.result_licenses rl
- left outer join SOURCE.licenses_normalized rln on rl.type=rln.license
+ from ${stats_db_name}.result_licenses rl
+ left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license
group by rl.id
) rln on rln.id=r.id;
-create table TARGET.result_affiliated_country stored as parquet as
+create table ${observatory_db_name}.result_affiliated_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -24,20 +24,20 @@ select
rfc.count > 1 as multiple_funders,
r.type,
c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
-create table TARGET.result_affiliated_year stored as parquet as
+create table ${observatory_db_name}.result_affiliated_year stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -53,20 +53,20 @@ select
rfc.count > 1 as multiple_funders,
r.type,
r.year
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
-create table TARGET.result_affiliated_year_country stored as parquet as
+create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -82,20 +82,20 @@ select
rfc.count > 1 as multiple_funders,
r.type,
r.year, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
-create table TARGET.result_affiliated_datasource stored as parquet as
+create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -111,22 +111,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
d.name as dname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_datasources rd on rd.id=r.id
- left outer join SOURCE.datasource d on d.id=rd.datasource
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ left outer join ${stats_db_name}.datasource d on d.id=rd.datasource
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
-create table TARGET.result_affiliated_datasource_country stored as parquet as
+create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -142,22 +142,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
d.name as dname, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_datasources rd on rd.id=r.id
- left outer join SOURCE.datasource d on d.id=rd.datasource
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ left outer join ${stats_db_name}.datasource d on d.id=rd.datasource
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
-create table TARGET.result_affiliated_organization stored as parquet as
+create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -173,20 +173,20 @@ select
rfc.count > 1 as multiple_funders,
r.type,
o.name as oname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
-create table TARGET.result_affiliated_organization_country stored as parquet as
+create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -202,20 +202,20 @@ select
rfc.count > 1 as multiple_funders,
r.type,
o.name as oname, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
-create table TARGET.result_affiliated_funder stored as parquet as
+create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -231,22 +231,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
p.funder as pfunder
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- join SOURCE.result_projects rp on rp.id=r.id
- join SOURCE.project p on p.id=rp.project
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ join ${stats_db_name}.result_projects rp on rp.id=r.id
+ join ${stats_db_name}.project p on p.id=rp.project
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
-create table TARGET.result_affiliated_funder_country stored as parquet as
+create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -262,22 +262,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
p.funder as pfunder, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_organization ro on ro.id=r.id
- join SOURCE.organization o on o.id=ro.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- join SOURCE.result_projects rp on rp.id=r.id
- join SOURCE.project p on p.id=rp.project
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_organization ro on ro.id=r.id
+ join ${stats_db_name}.organization o on o.id=ro.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ join ${stats_db_name}.result_projects rp on rp.id=r.id
+ join ${stats_db_name}.project p on p.id=rp.project
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
-create table TARGET.result_deposited_country stored as parquet as
+create table ${observatory_db_name}.result_deposited_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -293,22 +293,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
-create table TARGET.result_deposited_year stored as parquet as
+create table ${observatory_db_name}.result_deposited_year stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -324,22 +324,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
r.year
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
-create table TARGET.result_deposited_year_country stored as parquet as
+create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -355,22 +355,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
r.year, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
-create table TARGET.result_deposited_datasource stored as parquet as
+create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -386,22 +386,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
d.name as dname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
-create table TARGET.result_deposited_datasource_country stored as parquet as
+create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -417,22 +417,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
d.name as dname, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
-create table TARGET.result_deposited_organization stored as parquet as
+create table ${observatory_db_name}.result_deposited_organization stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -448,22 +448,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
o.name as oname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
-create table TARGET.result_deposited_organization_country stored as parquet as
+create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -479,22 +479,22 @@ select
rfc.count > 1 as multiple_funders,
r.type,
o.name as oname, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
-create table TARGET.result_deposited_funder stored as parquet as
+create table ${observatory_db_name}.result_deposited_funder stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -510,24 +510,24 @@ select
rfc.count > 1 as multiple_funders,
r.type,
p.funder as pfunder
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- join SOURCE.result_projects rp on rp.id=r.id
- join SOURCE.project p on p.id=rp.project
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ join ${stats_db_name}.result_projects rp on rp.id=r.id
+ join ${stats_db_name}.project p on p.id=rp.project
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
-create table TARGET.result_deposited_funder_country stored as parquet as
+create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
select
count(distinct r.id) as total,
r.green,
@@ -543,38 +543,19 @@ select
rfc.count > 1 as multiple_funders,
r.type,
p.funder as pfunder, c.code as ccode, c.name as cname
-from SOURCE.result r
- join SOURCE.result_datasources rd on rd.id=r.id
- join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
- join SOURCE.datasource_organizations dor on dor.id=d.id
- join SOURCE.organization o on o.id=dor.organization
- join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
- join SOURCE.result_projects rp on rp.id=r.id
- join SOURCE.project p on p.id=rp.project
- left outer join SOURCE.result_licenses rl on rl.id=r.id
- left outer join SOURCE.result_pids pids on pids.id=r.id
- left outer join SOURCE.result_cc_licence rln on rln.id=r.id
- left outer join SOURCE.result_projectcount rpc on rpc.id=r.id
- left outer join SOURCE.result_fundercount rfc on rfc.id=r.id
+from ${stats_db_name}.result r
+ join ${stats_db_name}.result_datasources rd on rd.id=r.id
+ join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+ join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
+ join ${stats_db_name}.organization o on o.id=dor.organization
+ join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
+ join ${stats_db_name}.result_projects rp on rp.id=r.id
+ join ${stats_db_name}.project p on p.id=rp.project
+ left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
+ left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
+ left outer join ${stats_db_name}.result_cc_licence rln on rln.id=r.id
+ left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
+ left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
- cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
-
--- compute stats TARGET.result_affiliated_country;
--- compute stats TARGET.result_affiliated_year;
--- compute stats TARGET.result_affiliated_year_country;
--- compute stats TARGET.result_affiliated_datasource;
--- compute stats TARGET.result_affiliated_datasource_country;
--- compute stats TARGET.result_affiliated_organization;
--- compute stats TARGET.result_affiliated_organization_country;
--- compute stats TARGET.result_affiliated_funder;
--- compute stats TARGET.result_affiliated_funder_country;
--- compute stats TARGET.result_deposited_country;
--- compute stats TARGET.result_deposited_year;
--- compute stats TARGET.result_deposited_year_country;
--- compute stats TARGET.result_deposited_datasource;
--- compute stats TARGET.result_deposited_datasource_country;
--- compute stats TARGET.result_deposited_organization;
--- compute stats TARGET.result_deposited_organization_country;
--- compute stats TARGET.result_deposited_funder;
--- compute stats TARGET.result_deposited_funder_country;
\ No newline at end of file
+ cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index 8fe05a933..08d33f4e8 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -326,20 +326,44 @@
${wf:appPath()}/scripts/step20-createMonitorDB.sql
monitor.sh
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ observatory-pre.sh
+ ${stats_db_name}
+ ${observatory_db_name}
+ ${observatory_db_shadow_name}
+ observatory-pre.sh
+
+
+ ${hive_jdbc_url}
+
+ stats_db_name=${stats_db_name}
+ observatory_db_name=${observatory_db_name}
+
+
+
+
+
+
${jobTracker}
${nameNode}
- observatory.sh
+ observatory-post.sh
${stats_db_name}
${observatory_db_name}
${observatory_db_shadow_name}
- ${wf:appPath()}/scripts/step21-createObservatoryDB.sql
- observatory.sh
+ observatory-post.sh