From a94a54a2d0a9bdd9e6178f7f63b5369bd639d440 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 15 Nov 2023 14:32:18 +0200 Subject: [PATCH 1/4] Changes for tables and creation of the new indicator indi_is_result_accessible - Drop table statements for all tables to avoid duplicates in case of wf rerun - Add pdfsaggregated step to create the indi_is_result_accessible table. This step is executed on the new impala cluster only, since the pdfaggregation_i is updated on this cluster. --- .../oozie_app/copyDataToImpalaCluster.sh | 1 - .../stats/oozie_app/createPDFsAggregated.sh | 42 +++++ .../graph/stats/oozie_app/scripts/step10.sql | 2 + .../graph/stats/oozie_app/scripts/step11.sql | 1 + .../graph/stats/oozie_app/scripts/step12.sql | 10 ++ .../graph/stats/oozie_app/scripts/step13.sql | 17 +- .../graph/stats/oozie_app/scripts/step14.sql | 16 +- .../graph/stats/oozie_app/scripts/step15.sql | 12 ++ .../stats/oozie_app/scripts/step15_5.sql | 10 ++ .../scripts/step16-createIndicatorsTables.sql | 18 +- .../scripts/step16_1-definitions.sql | 6 + .../stats/oozie_app/scripts/step16_5.sql | 1 + .../graph/stats/oozie_app/scripts/step2.sql | 17 ++ .../scripts/step20-createMonitorDB.sql | 155 +++--------------- .../scripts/step20-createMonitorDBAll.sql | 85 +++------- .../scripts/step20-createMonitorDB_RIs.sql | 3 +- .../step20-createMonitorDB_RIs_tail.sql | 3 +- .../scripts/step20-createMonitorDB_funded.sql | 4 +- .../step20-createMonitorDB_institutions.sql | 4 +- .../scripts/step21-createObservatoryDB.sql | 33 ---- .../graph/stats/oozie_app/scripts/step7.sql | 6 + .../graph/stats/oozie_app/scripts/step8.sql | 11 ++ .../graph/stats/oozie_app/scripts/step9.sql | 2 + .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++ 24 files changed, 234 insertions(+), 242 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/createPDFsAggregated.sh diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 431978997..18ff6dca8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,7 +6,6 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi -#export HADOOP_USER_NAME="dimitris.pierrakos" export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" function copydb() { diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/createPDFsAggregated.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/createPDFsAggregated.sh new file mode 100644 index 000000000..46631a0c2 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/createPDFsAggregated.sh @@ -0,0 +1,42 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +function createPDFsAggregated() { + db=$1 + +impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table if exists indi_is_result_accessible"; + +impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "create table indi_is_result_accessible stored as parquet as + select distinct p.id, coalesce(is_result_accessible, 0) as is_result_accessible from result p + left outer join + (select id, 1 as is_result_accessible from (select pl.* from result r + join pdfaggregation_i.publication p on r.id=p.id + join pdfaggregation_i.payload pl on pl.id=p.id + union all + select pl.* from result r + join pdfaggregation_i.publication p on r.id=p.dedupid + join pdfaggregation_i.payload pl on pl.id=p.id) foo) tmp on p.id=tmp.id"; +} + +STATS_DB=$1 +MONITOR_DB=$2 +HADOOP_USER_NAME=$3 + +createPDFsAggregated $STATS_DB +createPDFsAggregated $MONITOR_DB + +createPDFsAggregated $MONITOR_DB'_funded' +createPDFsAggregated $MONITOR_DB'_institutions' +createPDFsAggregated $MONITOR_DB'_ris_tail' + +contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" +for i in ${contexts} +do + tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` + createPDFsAggregated ${MONITOR_DB}'_'${tmp} +done \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 92dedf243..bbd7b3bbc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -49,5 +49,7 @@ select * from openaire_prod_usage_stats.views_stats; -- Creation date of the database ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; + create table ${stats_db_name}.creation_date STORED AS PARQUET as select date_format(current_date(), 'dd-MM-yyyy') as date; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 41c3ed751..638fb0f7a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -20,6 +20,7 @@ WHERE project_tmp.id IN (SELECT pr.id ${stats_db_name}.result r WHERE pr.result = r.id AND r.type = 'publication'); +DROP TABLE IF EXISTS ${stats_db_name}.project purge; CREATE TABLE ${stats_db_name}.project stored as parquet as SELECT p.id, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 47d147f75..0a1904de7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -1,22 +1,32 @@ ------------------------------------------------------------------------------------------------------ -- Creating parquet tables from 
the updated temporary tables and removing unnecessary temporary tables ------------------------------------------------------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; + CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * FROM ${stats_db_name}.datasource_tmp; +DROP TABLE IF EXISTS ${stats_db_name}.publication purge; + CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * FROM ${stats_db_name}.publication_tmp; +DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; + CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * FROM ${stats_db_name}.dataset_tmp; +DROP TABLE IF EXISTS ${stats_db_name}.software purge; + CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * FROM ${stats_db_name}.software_tmp; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 24e1a1355..6493fa7d0 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -5,6 +5,8 @@ -- Sources related tables/views ------------------------------------------------------ ------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( @@ -16,6 +18,8 @@ LEFT OUTER JOIN from 
${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( @@ -27,6 +31,8 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( @@ -38,6 +44,8 @@ LEFT OUTER JOIN from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( @@ -48,7 +56,7 @@ LEFT OUTER JOIN SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; - + CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS SELECT * FROM ${stats_db_name}.publication_sources UNION ALL @@ -58,6 +66,7 @@ SELECT * FROM ${stats_db_name}.software_sources UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid @@ -69,6 +78,8 @@ from ( LATERAL VIEW explode(auth.pid.qualifier.classid) apt as 
author_pid_type WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; +DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype from ${openaire_db_name}.relation rel @@ -82,6 +93,8 @@ where reltype='resultResult' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as select substr(target, 4) as id, count(distinct substr(source, 4)) as citations from ${openaire_db_name}.relation rel @@ -97,6 +110,8 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE group by substr(target, 4); +DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as select substr(source, 4) as id, count(distinct substr(target, 4)) as references from ${openaire_db_name}.relation rel diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 39755d68e..f50c13521 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -5,21 +5,29 @@ -- Licences related tables/views ------------------------------------------------------ ------------------------------------------------------ +DROP TABLE IF 
EXISTS ${stats_db_name}.publication_licenses purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses @@ -34,10 +42,14 @@ SELECT * FROM ${stats_db_name}.software_licenses UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS select substr(o.id, 4) as 
id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; +DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource FROM ( @@ -48,6 +60,8 @@ FROM ( from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 4a8f81943..066b197e6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -6,21 +6,29 @@ ------------------------------------------------------ ------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; + CREATE TABLE IF NOT EXISTS 
${stats_db_name}.dataset_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as select substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst @@ -35,12 +43,16 @@ select * from ${stats_db_name}.software_refereed union all select * from ${stats_db_name}.otherresearchproduct_refereed; +DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; + create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids where measures_ids.id!='views' and measures_ids.id!='downloads'; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; + create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, cast(rel.properties[0].value as double) apc_amount, diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 615f523ce..2c606fb92 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,6 +1,8 @@ ------------------------------------------- --- Extra tables, mostly used by indicators +DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; + create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as select r.id, count(distinct p.id) as count from ${stats_db_name}.result r @@ -8,6 +10,8 @@ left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; +DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; + create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as select r.id, count(distinct p.funder) as count from ${stats_db_name}.result r @@ -15,6 +19,8 @@ left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project group by r.id; +DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; + create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( select p.id as pid, count(distinct r.id) as `count`, r.type as type @@ -37,6 +43,8 @@ create or replace view ${stats_db_name}.totalresearchersft as select * from stat create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; +DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; + create table if not exists 
${stats_db_name}.result_instance stored as parquet as select distinct r.* from ( @@ -45,6 +53,8 @@ from ( from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; + create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as select r.id, r.amount, r.currency from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 6af486340..8180e6527 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -104,7 +104,7 @@ from ${stats_db_name}.tmp as o1 join ${stats_db_name}.tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name group by o1.organization, o2.organization, o1.name, o2.name; -drop table ${stats_db_name}.tmp purge; +drop table if exists ${stats_db_name}.tmp purge; create TEMPORARY TABLE ${stats_db_name}.tmp AS select distinct ro.organization organization, ro.id, o.name, o.country from ${stats_db_name}.result_organization ro @@ -118,7 +118,7 @@ from ${stats_db_name}.tmp as o1 join ${stats_db_name}.tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o1.name, o2.country; -drop table ${stats_db_name}.tmp purge; +drop table if exists ${stats_db_name}.tmp purge; create TEMPORARY TABLE ${stats_db_name}.tmp AS select o.id organization, o.name, ro.project as project from ${stats_db_name}.organization o @@ -133,7 +133,7 @@ from ${stats_db_name}.tmp as 
o1 where o1.organization<>o2.organization and o1.name<>o2.name group by o1.name,o2.name, o1.organization, o2.organization; -drop table ${stats_db_name}.tmp purge; +drop table if exists ${stats_db_name}.tmp purge; create TEMPORARY TABLE ${stats_db_name}.tmp AS select o.id organization, o.name, o.country , ro.project as project from ${stats_db_name}.organization o @@ -149,7 +149,7 @@ from ${stats_db_name}.tmp as o1 where o1.organization<>o2.organization and o1.country<>o2.country group by o1.organization, o2.country, o1.name; -drop table ${stats_db_name}.tmp purge; +drop table if exists ${stats_db_name}.tmp purge; drop table if exists ${stats_db_name}.indi_funder_country_collab purge; @@ -178,7 +178,7 @@ from ${stats_db_name}.tmp as o1 where o1.country<>o2.country group by o1.country, o2.country; -drop table ${stats_db_name}.tmp purge; +drop table if exists ${stats_db_name}.tmp purge; ---- Sprint 4 ---- drop table if exists ${stats_db_name}.indi_pub_diamond purge; @@ -422,7 +422,7 @@ drop table if exists ${stats_db_name}.indi_pub_hybrid purge; -- on pd.id=tmp.id; create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as -select pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication_datasources pd +select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication_datasources pd left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication_datasources pd join ${stats_db_name}.datasource d on pd.datasource=d.id join ${stats_db_name}.result_instance ri on ri.id=pd.id @@ -492,7 +492,7 @@ CREATE TEMPORARY TABLE ${stats_db_name}.allresults as select year, ro.organizati drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as -select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +select cast(allresults.year as int) year, 
allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from ${stats_db_name}.allresults join ${stats_db_name}.result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; @@ -813,8 +813,8 @@ drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; --and (d.type='Journal' or d.type='Journal Aggregator/Publisher') --and ri.accessright='Open Access') tmp on tmp.id=p.id; -create table ${stats_db_name}.indi_pub_bronze stored as parquet as -select pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication_datasources pd +create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as +select distinct pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication_datasources pd left outer join (select pd.id, 1 as is_bronze_oa from ${stats_db_name}.publication_datasources pd join ${stats_db_name}.datasource d on pd.datasource=d.id join ${stats_db_name}.result_instance ri on ri.id=pd.id diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 41c95758c..b55af13d4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,6 +3,8 @@ ---------------------------------------------------- -- Peer reviewed: +drop table if exists ${stats_db_name}.result_peerreviewed purge; + create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r @@ -10,12 +12,16 @@ left outer join 
${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; -- Green OA: +drop table if exists ${stats_db_name}.result_greenoa purge; + create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: +drop table if exists ${stats_db_name}.result_gold purge; + create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as select r.id, case when gold.is_gold=1 then true else false end as gold from ${stats_db_name}.result r diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index f737c1ea6..7faa91697 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -1,6 +1,7 @@ -- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, -- peer reviewed) drop table if exists ${stats_db_name}.result_tmp; + CREATE TABLE ${stats_db_name}.result_tmp ( id STRING, title STRING, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 4ffbd384b..8e56f98fc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -5,6 +5,7 
@@ -------------------------------------------------------------- -- Publication temporary table +DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; CREATE TABLE ${stats_db_name}.publication_tmp ( id STRING, @@ -40,12 +41,16 @@ SELECT substr(p.id, 4) as id, from ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; + CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; + CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id @@ -55,6 +60,8 @@ from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; + CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( @@ -66,29 +73,39 @@ FROM ( from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; + CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS 
${stats_db_name}.publication_oids purge; + CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; + CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; + CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; + CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index d5d242230..b52abd865 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -1,79 +1,3 @@ ---drop database if exists 
TARGET cascade; ---create database if not exists TARGET; --- ---create view if not exists TARGET.category as select * from SOURCE.category; ---create view if not exists TARGET.concept as select * from SOURCE.concept; ---create view if not exists TARGET.context as select * from SOURCE.context; ---create view if not exists TARGET.country as select * from SOURCE.country; ---create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; ---create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; ---create view if not exists TARGET.funder as select * from SOURCE.funder; ---create view if not exists TARGET.fundref as select * from SOURCE.fundref; ---create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; ---create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; ---create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; ---create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; ---create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; ---create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; --- ---create table TARGET.result stored as parquet as --- select distinct * from ( --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) --- union all --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) --- union all --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( --- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" --- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council --- 
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? --- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University --- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade --- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki --- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho --- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid --- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen --- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens --- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot --- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University --- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark --- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin --- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt --- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven --- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape --- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute --- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University --- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg --- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) --- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr --- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw --- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly --- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete --- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus --- 
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras --- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki --- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank --- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech --- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University --- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona --- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University --- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia --- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University --- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje --- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan --- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork --- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University --- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech --- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town --- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin --- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology --- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba --- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili --- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University --- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique --- ) )) foo; --- ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; - create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.context 
as select * from SOURCE.context; @@ -91,76 +15,52 @@ create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); ---ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; ---ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE 
TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); @@ -169,7 +69,6 @@ create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from 
TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; ---ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -178,7 +77,6 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; ---ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists TARGET.organization as select * from SOURCE.organization; @@ -196,28 +94,27 @@ create view if not exists TARGET.project_classification as select * from SOURCE. create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; ---ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; + create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; + create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; + -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from 
SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; + create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; + create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; + create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; + ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; + create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -226,32 +123,32 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; + create table 
TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; + create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; + ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; + ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; + create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); 
---ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; + ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; + create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; + create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -262,17 +159,17 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; + create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; + create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 
from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; + create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; + create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; + create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS; + create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id); create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index df4795e3e..2b6a68514 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -81,8 +81,6 @@ create table TARGET.result stored as parquet as 
'openorgs____::8839b55dae0c84d56fd533f52d5d483a' -- Leibniz Institute of Ecological Urban and Regional Development ) )) foo; ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; - create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.context as select * from SOURCE.context; @@ -97,86 +95,63 @@ create view if not exists TARGET.doctoratestudents as select * from SOURCE.docto create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; ---create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates; +create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where 
exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; ---ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); ---ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from 
TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS; + +create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; ---ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -185,7 +160,6 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; ---ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists TARGET.organization as select * from SOURCE.organization; @@ -203,28 +177,26 @@ create view if not exists TARGET.project_classification as select * from SOURCE. 
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; ---ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; + create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; + create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; + -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; + create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; + create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; + create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from 
TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; + ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -233,32 +205,29 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; + create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; + create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as 
parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; + create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; + create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); ---ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; + ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; + create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; create view 
TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -269,19 +238,19 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; + create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; + create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; + create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS; + create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS; + create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---ANALYZE TABLE TARGET.result_apc_affiliations 
COMPUTE STATISTICS; -create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.id); -create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s; create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness; create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess; create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql index 9a9407c2d..4469782f0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql @@ -11,5 +11,4 @@ create table TARGET.result stored as parquet as join SOURCE.context cont on cont.id=cat.context -- join SOURCE.result where rc.id=r.id and conc.category like CONTEXT) -) foo; ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file +) foo; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql index bad18efde..a28206d56 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql @@ -11,5 +11,4 @@ create table TARGET.result stored as parquet as join SOURCE.context cont on cont.id=cat.context -- join SOURCE.result where rc.id=r.id and conc.category not in (CONTEXTS)) -) foo; ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file +) foo; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql index b8d3c0242..ce6475c22 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql @@ -4,6 +4,4 @@ create database if not exists TARGET; create table TARGET.result stored as parquet as select distinct * from ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) - ) foo; - ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file + ) foo; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index 7bfba92a8..d2f08b391 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -59,6 +59,4 @@ create table TARGET.result stored as parquet as 'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University 'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University 'openorgs____::8839b55dae0c84d56fd533f52d5d483a' -- Leibniz Institute of Ecological Urban and Regional Development - ))) foo; - ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file + ))) foo; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index b7e421813..2e6f0711c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -8,7 +8,6 @@ from ${stats_db_name}.result r group by rl.id ) rln on rln.id=r.id; ---ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_country stored as parquet as select @@ -39,7 +38,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, 
r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_year stored as parquet as select @@ -70,7 +68,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select @@ -101,7 +98,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select @@ -134,8 +130,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; - create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -167,8 +161,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; ---ANALYZE 
TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, @@ -198,8 +190,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; - create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, @@ -229,8 +219,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, @@ -262,8 +250,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; - create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, @@ -295,8 +281,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, 
rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, @@ -328,8 +312,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as total, @@ -361,7 +343,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select @@ -394,8 +375,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, @@ -427,8 +406,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, 
rpc.count > 1, rfc.count > 1, d.name; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -460,8 +437,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_organization stored as parquet as select count(distinct r.id) as total, @@ -493,8 +468,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, @@ -526,8 +499,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, @@ -561,8 +532,6 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, 
r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; ---ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; - create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, @@ -595,5 +564,3 @@ from ${stats_db_name}.result r group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; - ---ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 1514ecf52..eb16a161e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -123,6 +123,8 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; +DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; + create table ${stats_db_name}.result_fos stored as parquet as with lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), @@ -133,6 +135,8 @@ from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); +DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; + CREATE TABLE ${stats_db_name}.result_organization 
STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r @@ -140,6 +144,8 @@ WHERE r.reltype = 'resultOrganization' and r.target like '50|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; + CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 248716b36..07204db0c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -5,6 +5,8 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; + CREATE TABLE ${stats_db_name}.datasource_tmp ( `id` string, @@ -48,6 +50,7 @@ WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
-- Creating a temporary dual table that will be removed after the following insert + CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); INSERT INTO ${stats_db_name}.dual VALUES ('X'); @@ -74,16 +77,22 @@ DROP TABLE ${stats_db_name}.dual; UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; + CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; + CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; + CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r @@ -91,6 +100,8 @@ WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = f -- datasource sources: -- where the datasource info have been collected from. 
+DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; + create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index 3da36dfe5..19d301e27 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -3,6 +3,8 @@ -- Organization table/view and Organization related tables/views ---------------------------------------------------------------- ---------------------------------------------------------------- +DROP TABLE IF EXISTS ${stats_db_name}.organization purge; + CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS SELECT substr(o.id, 4) as id, o.legalname.value as name, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index aa991730b..cbf97944d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -114,6 +114,7 @@ ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} ${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'step22a-createPDFsAggregated'} ${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'} 
${wf:conf('resumeFrom') eq 'Step24-updateCache'} @@ -448,6 +449,22 @@ ${hadoop_user_name} copyDataToImpalaCluster.sh + + + + + + + ${jobTracker} + ${nameNode} + createPDFsAggregated.sh + + + ${stats_db_name} + ${monitor_db_name} + ${hadoop_user_name} + createPDFsAggregated.sh + From 76594ded23455ecf67addcbebd3b4ca45b73199e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 1 Dec 2023 13:38:19 +0200 Subject: [PATCH 2/4] Changes to indicators Fixes on open access colours indicators - indi_pub_green_oa - indi_pub_gold_oa - indi_pub_hybrid - indi_pub_bronze_oa - indi_pub_diamond --- .../scripts/step16-createIndicatorsTables.sql | 129 +++++++++++++----- .../scripts/step20-createMonitorDB.sql | 2 + .../scripts/step20-createMonitorDBAll.sql | 2 + .../graph/stats/oozie_app/scripts/step3.sql | 18 +++ .../graph/stats/oozie_app/scripts/step4.sql | 15 ++ .../graph/stats/oozie_app/scripts/step5.sql | 16 +++ .../graph/stats/oozie_app/scripts/step6.sql | 14 ++ 7 files changed, 162 insertions(+), 34 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 8180e6527..fea449de6 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,6 +1,18 @@ -- Sprint 1 ---- drop table if exists ${stats_db_name}.indi_pub_green_oa purge; +--create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as +--select distinct p.id, coalesce(green_oa, 0) as green_oa +--from ${stats_db_name}.publication p +-- left outer join ( +-- select p.id, 1 as green_oa +-- from ${stats_db_name}.publication p +-- join 
${stats_db_name}.result_instance ri on ri.id = p.id +-- join ${stats_db_name}.datasource on datasource.id = ri.hostedby +-- where datasource.type like '%Repository%' +-- and (ri.accessright = 'Open Access' +-- or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp +-- on p.id= tmp.id; create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from ${stats_db_name}.publication p @@ -11,7 +23,7 @@ from ${stats_db_name}.publication p join ${stats_db_name}.datasource on datasource.id = ri.hostedby where datasource.type like '%Repository%' and (ri.accessright = 'Open Access' - or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp + or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and datasource.name!='Other') tmp on p.id= tmp.id; drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; @@ -183,15 +195,24 @@ drop table if exists ${stats_db_name}.tmp purge; ---- Sprint 4 ---- drop table if exists ${stats_db_name}.indi_pub_diamond purge; +--create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as +--select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal +--from ${stats_db_name}.publication_datasources pd +-- left outer join ( +-- select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd +-- join ${stats_db_name}.datasource d on d.id=pd.datasource +-- join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) +-- and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp +-- on pd.id=tmp.id; + create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from ${stats_db_name}.publication_datasources pd - left outer join ( - select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd - 
join ${stats_db_name}.datasource d on d.id=pd.datasource - join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) - and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp - on pd.id=tmp.id; +left outer join (select pd.id, 1 as in_diamond_journal from ${stats_db_name}.publication_datasources pd +join ${stats_db_name}.datasource d on d.id=pd.datasource +join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) +and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp +on pd.id=tmp.id; drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; @@ -312,28 +333,55 @@ drop table if exists ${stats_db_name}.indi_pub_gold_oa purge; -- JOIN gold_oa on issn.issn = gold_oa.issn) tmp -- on pd.id=tmp.id; +--create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet as +--with gold_oa as ( +--SELECT issn,issn_l from stats_ext.issn_gold_oa_dataset_v5), +--issn AS (SELECT * FROM +--(SELECT id,issn_printed as issn FROM ${stats_db_name}.datasource +--WHERE issn_printed IS NOT NULL +--UNION ALL +--SELECT id, issn_online as issn FROM ${stats_db_name}.datasource +--WHERE issn_online IS NOT NULL or id like '%doajarticles%') as issn +--WHERE LENGTH(issn) > 7), +--alljournals AS(select issn, issn_l from stats_ext.alljournals +--where journal_is_in_doaj=true or journal_is_oa=true) +--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold +--FROM ${stats_db_name}.publication_datasources pd +--left outer join ( +--select pd.id, 1 as is_gold FROM ${stats_db_name}.publication_datasources pd +--JOIN issn on issn.id=pd.datasource +--JOIN gold_oa on issn.issn = gold_oa.issn +--join alljournals on issn.issn=alljournals.issn +--left outer join ${stats_db_name}.result_instance ri on ri.id=pd.id +--and ri.accessright!='Closed Access' and ri.accessright_uw='gold') tmp +--on pd.id=tmp.id; create table if not exists 
${stats_db_name}.indi_pub_gold_oa stored as parquet as with gold_oa as ( -SELECT issn,issn_l from stats_ext.issn_gold_oa_dataset_v5), -issn AS (SELECT * FROM -(SELECT id,issn_printed as issn FROM ${stats_db_name}.datasource -WHERE issn_printed IS NOT NULL -UNION ALL -SELECT id, issn_online as issn FROM ${stats_db_name}.datasource -WHERE issn_online IS NOT NULL or id like '%doajarticles%') as issn -WHERE LENGTH(issn) > 7), -alljournals AS(select issn, issn_l from stats_ext.alljournals -where journal_is_in_doaj=true or journal_is_oa=true) +select distinct issn from ( + SELECT issn_l as issn from stats_ext.issn_gold_oa_dataset_v5 + UNION ALL + SELECT issn as issn from stats_ext.issn_gold_oa_dataset_v5 + UNION ALL + select issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true + UNION ALL + select issn_l as issn from stats_ext.alljournals where journal_is_in_doaj=true or journal_is_oa=true) foo), +dd as ( +select distinct * from ( + select id, issn_printed as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%' + UNION ALL + select id, issn_online as issn from ${stats_db_name}.datasource d where d.id like '%doajarticles%' + UNION ALL + select id, issn_printed as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_printed + UNION ALL + select id, issn_online as issn from ${stats_db_name}.datasource d join gold_oa on gold_oa.issn=d.issn_online) foo +) SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold FROM ${stats_db_name}.publication_datasources pd left outer join ( -select pd.id, 1 as is_gold FROM ${stats_db_name}.publication_datasources pd -JOIN issn on issn.id=pd.datasource -JOIN gold_oa on issn.issn = gold_oa.issn -join alljournals on issn.issn=alljournals.issn -left outer join ${stats_db_name}.result_instance ri on ri.id=pd.id -and ri.accessright!='Closed Access' and ri.accessright_uw='gold') tmp -on pd.id=tmp.id; + select pd.id, 1 as is_gold + FROM ${stats_db_name}.publication_datasources 
pd + join dd on dd.id=pd.datasource + left outer join ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; @@ -421,15 +469,26 @@ drop table if exists ${stats_db_name}.indi_pub_hybrid purge; -- where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp -- on pd.id=tmp.id; +--create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as +--select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication_datasources pd +--left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication_datasources pd +--join ${stats_db_name}.datasource d on pd.datasource=d.id +--join ${stats_db_name}.result_instance ri on ri.id=pd.id +--join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id +--join ${stats_db_name}.result_accessroute ra on ra.id=pd.id +--where d.type like '%Journal%' and ri.accessright!='Closed Access' and (ri.accessright_uw!='gold' +--or indi_gold.is_gold=0) and (ra.accessroute='hybrid' or ri.license is not null)) tmp +--on pd.id=tmp.id; + create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as -select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication_datasources pd -left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication_datasources pd -join ${stats_db_name}.datasource d on pd.datasource=d.id +select distinct pd.id,coalesce(is_hybrid,0) is_hybrid from ${stats_db_name}.publication pd +left outer join (select pd.id, 1 as is_hybrid from ${stats_db_name}.publication pd join ${stats_db_name}.result_instance ri on ri.id=pd.id join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id join ${stats_db_name}.result_accessroute ra on ra.id=pd.id -where d.type like '%Journal%' and ri.accessright!='Closed Access' and (ri.accessright_uw!='gold' -or indi_gold.is_gold=0) and 
(ra.accessroute='hybrid' or ri.license is not null)) tmp +join ${stats_db_name}.datasource d on d.id=ri.hostedby +where indi_gold.is_gold=0 and ((d.type like '%Journal%' and ri.accessright!='Closed Access' and ri.accessright!='Restricted' and ri.license is not null) or +ra.accessroute='hybrid'))tmp on pd.id=tmp.id; drop table if exists ${stats_db_name}.indi_org_fairness purge; @@ -814,14 +873,16 @@ drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; --and ri.accessright='Open Access') tmp on tmp.id=p.id; create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as -select distinct pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication_datasources pd -left outer join (select pd.id, 1 as is_bronze_oa from ${stats_db_name}.publication_datasources pd -join ${stats_db_name}.datasource d on pd.datasource=d.id +select distinct pd.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication pd +left outer join (select pd.id, 1 as is_bronze_oa from ${stats_db_name}.publication pd join ${stats_db_name}.result_instance ri on ri.id=pd.id join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=pd.id +join ${stats_db_name}.indi_pub_hybrid indi_hybrid on indi_hybrid.id=pd.id join ${stats_db_name}.result_accessroute ra on ra.id=pd.id -where d.type like '%Journal%' and ri.accessright!='Closed Access' and (ri.accessright_uw!='gold' -or indi_gold.is_gold=0) and (ra.accessroute='bronze' or ri.license is null)) tmp +join ${stats_db_name}.datasource d on d.id=ri.hostedby +where indi_gold.is_gold=0 and indi_hybrid.is_hybrid=0 +and ((d.type like '%Journal%' and ri.accessright!='Closed Access' +and ri.accessright!='Restricted' and ri.license is null) or ra.accessroute='bronze')) tmp on pd.id=tmp.id; CREATE TEMPORARY TABLE ${stats_db_name}.project_year_result_year as diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index b52abd865..c61a19e5c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -64,6 +64,8 @@ create table TARGET.result_accessroute stored as parquet as select * from SOURCE create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); + create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index 2b6a68514..167aac726 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -248,6 +248,8 @@ create table TARGET.indi_impact_measures stored as parquet as select * from SOUR create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from 
SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index eb97263a7..0384de4ec 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,6 +5,8 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates +DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; + CREATE TABLE ${stats_db_name}.dataset_tmp ( id STRING, @@ -40,6 +42,8 @@ SELECT substr(d.id, 4) AS id, FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; + CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, 
"//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d @@ -47,12 +51,16 @@ FROM ${openaire_db_name}.dataset d WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; + CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; + CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id @@ -62,6 +70,8 @@ from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; + CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( @@ -74,23 +84,31 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; + CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; + CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM 
${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; + CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; + CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 0d1f6323e..d8f4d65e4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,6 +5,7 @@ -------------------------------------------------------- -- Software temporary table supporting updates +DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; CREATE TABLE ${stats_db_name}.software_tmp ( id STRING, @@ -40,6 +41,8 @@ SELECT substr(s.id, 4) as id, from ${openaire_db_name}.software s where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; + CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s @@ -47,6 +50,8 @@ FROM 
${openaire_db_name}.software s where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; + CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p @@ -62,6 +67,8 @@ FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; + CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( @@ -74,23 +81,31 @@ FROM ( FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; + CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; + CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; + CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and 
p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; + CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 06b616d6a..fae0fbb63 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,6 +5,8 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, @@ -40,6 +42,8 @@ FROM ${openaire_db_name}.otherresearchproduct o WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; -- Otherresearchproduct_citations +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation @@ -51,6 +55,8 @@ SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; + 
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id @@ -59,6 +65,8 @@ SELECT substr(p.id, 4) as id, case FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource @@ -68,21 +76,29 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM 
${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; + CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index e0522e149..e5b3f504e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -3,29 +3,39 @@ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ +DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; + CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; + CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; + CREATE TABLE 
${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; + create table ${stats_db_name}.project_classification STORED AS PARQUET as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; +DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; + CREATE TABLE ${stats_db_name}.project_tmp ( id STRING, @@ -80,12 +90,16 @@ SELECT substr(p.id, 4) AS id, FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +DROP TABLE IF EXISTS ${stats_db_name}.funder purge; + create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; +DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; + CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, properties[0].value contribution, properties[1].value currency From a397112cb86a6f26798b18ce159837b032878fe1 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 1 Dec 2023 15:00:18 +0200 Subject: [PATCH 3/4] Add new indicator Add indi_pub_publicly_funded --- .../scripts/step16-createIndicatorsTables.sql | 21 
+++++++++++++++++++
 .../scripts/step20-createMonitorDB.sql | 1 +
 .../scripts/step20-createMonitorDBAll.sql | 1 +
 3 files changed, 23 insertions(+)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index fea449de6..5aa14e2c2 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -1177,3 +1177,34 @@ select allresults.ri_initiative, result_findable.no_result_findable/allresults.n
 from allresults
 join result_findable on result_findable.ri_initiative=allresults.ri_initiative;
 
+-- indi_pub_publicly_funded: one row per publication id, with publicly_funded = 1 when the
+-- publication is linked through result_organization to an organization whose name appears
+-- in publicly_funded_orgs, and 0 otherwise (coalesce over the left outer join).
+-- publicly_funded_orgs gathers names from stats_ext.insitutions_for_publicly_funded that match
+-- a fundref entry of type government, a project funder, or an organization (by name or by
+-- pid = ror) where the organization branch also requires pf.publicly_funded = yes.
+-- NOTE(review): the organization branch emits pf.name even for ror-only matches, while the
+-- final join compares o.name with that name - confirm this is intended.
+-- Dropped first so that a workflow rerun rebuilds the table instead of keeping the old one.
+drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge;
+create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
+with org_names_pids as
+(select org.id,name, pid from ${stats_db_name}.organization org
+join ${stats_db_name}.organization_pids op on org.id=op.id),
+publicly_funded_orgs as
+(select distinct name from
+(select pf.name from stats_ext.insitutions_for_publicly_funded pf
+join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
+union all
+select pf.name from stats_ext.insitutions_for_publicly_funded pf
+join ${stats_db_name}.project p on p.funder=pf.name
+union all
+select pf.name from stats_ext.insitutions_for_publicly_funded pf
+join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
+and pf.publicly_funded='yes') foo)
+select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
+from ${stats_db_name}.publication p
+left outer join (
+select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
+join ${stats_db_name}.organization o on o.id=ro.organization
+join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
index c61a19e5c..cc8348f26 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@@ -63,6 +63,7 @@ create table TARGET.result_fos stored as parquet as select * from SOURCE.result_
 create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 
 create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
+create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 
 create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql
index 167aac726..68417f3e0 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql
@@ -250,6 +250,7 @@ create table
TARGET.indi_pub_interdisciplinarity stored as parquet as select * f create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s; From c9d995dde0d48b56e99af43e4c91273817def678 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 1 Dec 2023 15:44:35 +0200 Subject: [PATCH 4/4] New institutions added --- .../stats/oozie_app/scripts/step20-createMonitorDBAll.sql | 5 ++++- .../scripts/step20-createMonitorDB_institutions.sql | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index 68417f3e0..42812d159 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -78,7 +78,10 @@ create table 
TARGET.result stored as parquet as 'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa 'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University 'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University - 'openorgs____::8839b55dae0c84d56fd533f52d5d483a' -- Leibniz Institute of Ecological Urban and Regional Development + 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development + 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology + 'openorgs____::b5ca9d4340e26454e367e2908ef3872f' -- Alma Mater Studiorum University of Bologna + ) )) foo; create view if not exists TARGET.category as select * from SOURCE.category; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index d2f08b391..2c0ac337c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -58,5 +58,7 @@ create table TARGET.result stored as parquet as 'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa 'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University 'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University - 'openorgs____::8839b55dae0c84d56fd533f52d5d483a' -- Leibniz Institute of Ecological Urban and Regional Development + 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development + 
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology + 'openorgs____::b5ca9d4340e26454e367e2908ef3872f' -- Alma Mater Studiorum University of Bologna ))) foo; \ No newline at end of file