From db33f7727cd1dc284ee7a2a9ca3435fe3dae69d6 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 15 Apr 2024 16:22:40 +0300 Subject: [PATCH 01/19] Update "dhp-stats-update" workflow to use "spark"-actions, instead of "hive" ones. Note: Currently the code is set to only test the "Step1". --- .../dhp-stats-update/installProject.sh | 18 + .../dhp-stats-update/runOozieWorkfow.sh | 20 + .../graph/stats/oozie_app/scripts/step1.sql | 4 +- .../graph/stats/oozie_app/scripts/step10.sql | 24 +- .../graph/stats/oozie_app/scripts/step11.sql | 18 +- .../graph/stats/oozie_app/scripts/step12.sql | 34 +- .../graph/stats/oozie_app/scripts/step13.sql | 34 +- .../graph/stats/oozie_app/scripts/step14.sql | 30 +- .../graph/stats/oozie_app/scripts/step15.sql | 26 +- .../stats/oozie_app/scripts/step15_5.sql | 36 +- .../scripts/step16_1-definitions.sql | 12 +- .../stats/oozie_app/scripts/step16_5.sql | 20 +- .../graph/stats/oozie_app/scripts/step2.sql | 38 +- .../scripts/step21-createObservatoryDB.sql | 38 +- .../graph/stats/oozie_app/scripts/step3.sql | 38 +- .../graph/stats/oozie_app/scripts/step4.sql | 36 +- .../graph/stats/oozie_app/scripts/step5.sql | 36 +- .../graph/stats/oozie_app/scripts/step6.sql | 30 +- .../graph/stats/oozie_app/scripts/step7.sql | 30 +- .../graph/stats/oozie_app/scripts/step8.sql | 36 +- .../graph/stats/oozie_app/scripts/step9.sql | 8 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 443 +++++++++++++----- 22 files changed, 627 insertions(+), 382 deletions(-) create mode 100755 dhp-workflows/dhp-stats-update/installProject.sh create mode 100755 dhp-workflows/dhp-stats-update/runOozieWorkfow.sh diff --git a/dhp-workflows/dhp-stats-update/installProject.sh b/dhp-workflows/dhp-stats-update/installProject.sh new file mode 100755 index 0000000000..afd95578da --- /dev/null +++ b/dhp-workflows/dhp-stats-update/installProject.sh @@ -0,0 +1,18 @@ +# Install the whole "dnet-hadoop" project. + +# Delete this module's previous build-files in order to avoid any conflicts. 
+rm -rf target/ + +# Go to the root directory of this project. +cd ../../ + +# Select the build profile. +DEFAULT_PROFILE='' # It's the empty profile. +NEWER_VERSIONS_PROFILE='-Pscala-2.12' +CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} + +# Install the project. +mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true + +# We skip tests for all modules, since they take a big amount of time and some of them fail. +# Any test added to this module, will be executed in the "runOozieWorkflow.sh" script. diff --git a/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh new file mode 100755 index 0000000000..a4825a3aea --- /dev/null +++ b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh @@ -0,0 +1,20 @@ +# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file. + +# Select the build profile. +DEFAULT_PROFILE='' # It's the empty profile. +NEWER_VERSIONS_PROFILE='-Pscala-2.12' +CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE} + +# Build and deploy this module. +mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \ + -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats + +# Show the Oozie-job-ID. 
+echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n" +cat ./target/extract-and-run-on-remote-host.log + +# Check oozie workflow status +# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <oozie-job-id> + +# Get the <application-id> from the previous output and check the logs: +# yarn logs -applicationId <application-id> diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql index 9697a1dc8d..467a98872c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql @@ -4,5 +4,5 @@ -------------------------------------------------------------- -------------------------------------------------------------- -DROP database IF EXISTS ${stats_db_name} CASCADE; -CREATE database ${stats_db_name}; +DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/ +CREATE database ${stats_db_name}; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index bbd7b3bbcd..9088ce2052 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -5,27 +5,27 @@ ------------------------------------------------------------------------------------------------ CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * -FROM ${external_stats_db_name}.fundref; +FROM ${external_stats_db_name}.fundref; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * -FROM 
${external_stats_db_name}.country; +FROM ${external_stats_db_name}.country; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * -FROM ${external_stats_db_name}.countrygdp; +FROM ${external_stats_db_name}.countrygdp; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * -FROM ${external_stats_db_name}.roarmap; +FROM ${external_stats_db_name}.roarmap; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * -FROM ${external_stats_db_name}.rndexpediture; +FROM ${external_stats_db_name}.rndexpediture; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS SELECT * -FROM ${external_stats_db_name}.licenses_normalized; +FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/ ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ @@ -33,23 +33,23 @@ FROM ${external_stats_db_name}.licenses_normalized; ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ create or replace view ${stats_db_name}.usage_stats as -select * from openaire_prod_usage_stats.usage_stats; +select * from openaire_prod_usage_stats.usage_stats; /*EOS*/ create or replace view ${stats_db_name}.downloads_stats as -select * from openaire_prod_usage_stats.downloads_stats; +select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/ create or replace view ${stats_db_name}.pageviews_stats as -select * from openaire_prod_usage_stats.pageviews_stats; +select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/ create or replace view ${stats_db_name}.views_stats as -select * from openaire_prod_usage_stats.views_stats; +select * from openaire_prod_usage_stats.views_stats; /*EOS*/ 
------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; +DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/ create table ${stats_db_name}.creation_date STORED AS PARQUET as -select date_format(current_date(), 'dd-MM-yyyy') as date; +select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 638fb0f7a9..06600db192 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -10,7 +10,7 @@ SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd - WHERE d.id = rd.datasource); + WHERE d.id = rd.datasource); /*EOS*/ -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables UPDATE ${stats_db_name}.project_tmp @@ -19,8 +19,8 @@ WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result = r.id - AND r.type = 'publication'); -DROP TABLE IF EXISTS ${stats_db_name}.stored purge; + AND r.type = 'publication'); /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/ CREATE TABLE 
${stats_db_name}.project stored as parquet as SELECT p.id, @@ -63,7 +63,7 @@ FROM ${stats_db_name}.project_tmp p AND r.type = 'publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; + ON prr2.id = p.id; /*EOS*/ UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' @@ -73,7 +73,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' @@ -83,7 +83,7 @@ WHERE dataset_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' @@ -93,7 +93,7 @@ WHERE software_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' @@ -103,7 +103,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); + AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, @@ -116,4 +116,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; \ No newline at end of file + AND project.id = result_projects.project; /*EOS*/ \ No newline at end of file diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 0a1904de7b..ff95524be8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -1,42 +1,42 @@ ------------------------------------------------------------------------------------------------------ -- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables ------------------------------------------------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * -FROM ${stats_db_name}.datasource_tmp; +FROM ${stats_db_name}.datasource_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * -FROM ${stats_db_name}.publication_tmp; +FROM ${stats_db_name}.publication_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * -FROM ${stats_db_name}.dataset_tmp; +FROM ${stats_db_name}.dataset_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software purge; +DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * -FROM ${stats_db_name}.software_tmp; +FROM ${stats_db_name}.software_tmp; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; +DROP TABLE IF EXISTS 
${stats_db_name}.otherresearchproduct purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * -FROM ${stats_db_name}.otherresearchproduct_tmp; +FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.project_tmp; -DROP TABLE ${stats_db_name}.datasource_tmp; -DROP TABLE ${stats_db_name}.publication_tmp; -DROP TABLE ${stats_db_name}.dataset_tmp; -DROP TABLE ${stats_db_name}.software_tmp; -DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; +DROP TABLE ${stats_db_name}.project_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.datasource_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.publication_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.dataset_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.software_tmp; /*EOS*/ +DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ ---------------------------------------------- -- Re-creating views from final parquet tables @@ -54,4 +54,4 @@ SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct; +FROM ${stats_db_name}.otherresearchproduct; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 8c1dbdc4dd..68a46ded33 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -5,7 +5,7 @@ -- Sources related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS 
${stats_db_name}.publication_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -16,9 +16,9 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -29,9 +29,9 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -42,9 +42,9 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else 
p.datasource end as datasource @@ -55,7 +55,7 @@ LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS SELECT * FROM ${stats_db_name}.publication_sources @@ -64,9 +64,9 @@ SELECT * FROM ${stats_db_name}.dataset_sources UNION ALL SELECT * FROM ${stats_db_name}.software_sources UNION ALL -SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid @@ -76,9 +76,9 @@ from ( LATERAL VIEW explode(author) a as auth LATERAL VIEW explode(auth.pid) ap as auth_pid LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type - WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; + WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype @@ -91,9 +91,9 @@ where reltype='resultResult' and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' - and rel.datainfo.deletedbyinference=false 
and rel.datainfo.invisible = FALSE; + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as select substr(target, 4) as id, count(distinct substr(source, 4)) as citations @@ -108,9 +108,9 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE -group by substr(target, 4); +group by substr(target, 4); /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as select substr(source, 4) as id, count(distinct substr(target, 4)) as references @@ -125,4 +125,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE -group by substr(source, 4); \ No newline at end of file +group by substr(source, 4); /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index f50c13521b..f61c702219 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -5,33 +5,33 @@ -- Licences related tables/views 
------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and 
p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS SELECT substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses -where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; +where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses @@ -40,15 +40,15 @@ SELECT * FROM ${stats_db_name}.dataset_licenses UNION ALL SELECT * FROM ${stats_db_name}.software_licenses UNION ALL -SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid -from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; +from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource @@ -58,10 +58,10 @@ FROM ( LEFT OUTER JOIN ( SELECT 
substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; + WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result -lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; +lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index ce6b6cc2fc..7c618fd0f2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -6,7 +6,7 @@ ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -22,9 +22,9 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS 
${stats_db_name}.dataset_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -40,9 +40,9 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -58,9 +58,9 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as with peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed @@ -76,7 +76,7 @@ from ( union all select non_peer_reviewed.* from non_peer_reviewed left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id - where peer_reviewed.id is null) pr; + where peer_reviewed.id is null) pr; /*EOS*/ CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as select * from ${stats_db_name}.publication_refereed @@ -85,17 +85,17 @@ select * from ${stats_db_name}.dataset_refereed union all select * from ${stats_db_name}.software_refereed union all -select * from ${stats_db_name}.otherresearchproduct_refereed; +select * from 
${stats_db_name}.otherresearchproduct_refereed; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; +DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids -where measures_ids.id!='views' and measures_ids.id!='downloads'; +where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, @@ -104,4 +104,4 @@ rel.properties[1].value apc_currency from ${openaire_db_name}.relation rel join ${openaire_db_name}.organization o on o.id=rel.source join ${openaire_db_name}.result r on r.id=rel.target -where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; \ No newline at end of file +where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 6ed686a050..54743e046e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,25 +1,25 @@ ------------------------------------------- --- Extra tables, mostly used by indicators -DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as select r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project -group by r.id; +group by r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as select r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project -group by r.id; +group by r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/ create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as with rcount as ( @@ -33,17 +33,17 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els sum(case when rcount.type='software' then rcount.count else 0 end) as software, sum(case when rcount.type='other' then rcount.count else 0 end) as other from rcount -group by rcount.pid; +group by rcount.pid; /*EOS*/ -create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; -create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; -create or replace view ${stats_db_name}.doctoratestudents as select * from 
stats_ext.doctoratestudents; -create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; -create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; -create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; -create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; +create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/ +create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/ +create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/ +create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/ +create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/ +create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/ +create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/ create table if not exists ${stats_db_name}.result_instance stored as parquet as select distinct r.* @@ -51,9 +51,9 @@ from ( select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r -join ${stats_db_name}.result res on res.id=r.id; +join 
${stats_db_name}.result res on res.id=r.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as select distinct r.id, r.amount, r.currency @@ -61,6 +61,6 @@ from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id -where r.amount is not null; +where r.amount is not null; /*EOS*/ -create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; \ No newline at end of file +create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index b55af13d43..399381b125 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,26 +3,26 @@ ---------------------------------------------------- -- Peer reviewed: -drop table if exists ${stats_db_name}.result_peerreviewed purge; +drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left 
outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id -left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; +left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/ -- Green OA: -drop table if exists ${stats_db_name}.result_greenoa purge; +drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r -left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; +left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/ -- GOLD OA: -drop table if exists ${stats_db_name}.result_gold purge; +drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as select r.id, case when gold.is_gold=1 then true else false end as gold from ${stats_db_name}.result r - left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file + left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index 7faa916970..1b838ca1ba 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -1,6 +1,6 @@ -- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, -- peer reviewed) -drop table if exists ${stats_db_name}.result_tmp; +drop table if exists ${stats_db_name}.result_tmp; 
/*EOS*/ CREATE TABLE ${stats_db_name}.result_tmp ( id STRING, @@ -20,37 +20,37 @@ CREATE TABLE ${stats_db_name}.result_tmp ( peer_reviewed BOOLEAN, green BOOLEAN, gold BOOLEAN) -clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); +clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.publication r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.dataset r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.software r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER 
JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ insert into ${stats_db_name}.result_tmp select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold FROM ${stats_db_name}.otherresearchproduct r LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ -drop table if exists ${stats_db_name}.result; -drop view if exists ${stats_db_name}.result; -create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; \ No newline at end of file +drop table if exists ${stats_db_name}.result; /*EOS*/ +drop view if exists ${stats_db_name}.result; /*EOS*/ +create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/ +drop table ${stats_db_name}.result_tmp; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 8e56f98fc4..4aa90b1a24 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -5,7 +5,7 @@ -------------------------------------------------------------- -- Publication temporary table -DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; +DROP TABLE IF EXISTS 
${stats_db_name}.publication_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_tmp ( id STRING, @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.publication_tmp abstract BOOLEAN, type STRING ) - clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); + clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, @@ -39,17 +39,17 @@ SELECT substr(p.id, 4) as id, case when size(p.description) > 0 then true else false end as abstract, 'publication' as type from ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -58,9 +58,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where 
p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource @@ -71,44 +71,44 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM 
${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file + and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 66620ac388..adcf23b7ad 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -5,7 +5,7 @@ from ${stats_db_name}.result r select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count from ${stats_db_name}.result_licenses rl group by rl.id -) rln on rln.id=r.id; +) rln on rln.id=r.id; /*EOS*/ create table ${observatory_db_name}.result_affiliated_country stored as parquet as @@ -35,7 +35,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_year stored as parquet as @@ -65,7 +65,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/ create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as @@ -95,7 +95,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true 
else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as @@ -127,7 +127,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select @@ -158,7 +158,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select @@ -187,7 +187,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, 
rpc.count > 1, rfc.count > 1, o.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select @@ -216,7 +216,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select @@ -247,7 +247,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select @@ -278,7 +278,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, 
rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_country stored as parquet as select @@ -309,7 +309,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_year stored as parquet as select @@ -340,7 +340,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/ create table ${observatory_db_name}.result_deposited_year_country stored as parquet as @@ -372,7 +372,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource stored as 
parquet as select @@ -403,7 +403,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select @@ -434,7 +434,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization stored as parquet as select @@ -465,7 +465,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select @@ -496,7 +496,7 @@ from ${stats_db_name}.result r left outer join 
${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder stored as parquet as select @@ -529,7 +529,7 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select @@ -562,4 +562,4 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 
0384de4ec9..1ff4beadb1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,7 +5,7 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_tmp ( @@ -23,7 +23,7 @@ CREATE TABLE ${stats_db_name}.dataset_tmp abstract BOOLEAN, type STRING ) - clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); + clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, @@ -40,26 +40,26 @@ SELECT substr(d.id, 4) AS id, CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, 'dataset' AS type FROM ${openaire_db_name}.dataset d -WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; +WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; + and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; +DROP TABLE IF EXISTS 
${stats_db_name}.dataset_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -68,9 +68,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource @@ -82,35 +82,35 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language 
FROM ${openaire_db_name}.dataset p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index d8f4d65e42..426d537739 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,7 +5,7 @@ -------------------------------------------------------- -- Software temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_tmp ( id STRING, @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.software_tmp abstract BOOLEAN, type STRING ) - clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); + clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, @@ -39,24 +39,24 @@ SELECT substr(s.id, 4) as id, CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, 'software' as type from ${openaire_db_name}.software s -where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; +where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; + and s.datainfo.deletedbyinference = false and 
s.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -65,9 +65,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource @@ -79,35 +79,35 @@ FROM ( LEFT OUTER JOIN ( SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS select substr(p.id, 4) AS id, p.language.classname AS 
language FROM ${openaire_db_name}.software p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; +DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index fae0fbb63c..6b5adff9d6 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,7 +5,7 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp source STRING, abstract BOOLEAN, type STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, @@ -39,23 +39,23 @@ SELECT substr(o.id, 4) AS id, CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, 'other' AS type FROM ${openaire_db_name}.otherresearchproduct o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; +WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EOS*/ -- Otherresearchproduct_citations -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE 
xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; + and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case @@ -63,9 +63,9 @@ SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource @@ -74,32 +74,32 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p LEFT OUTER JOIN(SELECT 
substr(d.id, 4) id from ${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; + WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; +DROP TABLE IF EXISTS 
${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; \ No newline at end of file +where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 165f779468..75ec7d69c8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -3,38 +3,38 @@ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids -where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; +where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS 
organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/ create table ${stats_db_name}.project_classification STORED AS PARQUET as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class -where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; +where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_tmp ( @@ -61,7 +61,7 @@ CREATE TABLE ${stats_db_name}.project_tmp totalcost FLOAT, fundedamount FLOAT, currency STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ INSERT INTO 
${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, @@ -88,18 +88,18 @@ SELECT substr(p.id, 4) AS id, p.fundedamount AS fundedamount, p.currency.value AS currency FROM ${openaire_db_name}.project p -WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; +WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.funder purge; +DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/ create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; +DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, @@ -107,4 +107,4 @@ properties[0].value contribution, properties[1].value currency from ${openaire_db_name}.relation r LATERAL VIEW explode (r.properties) properties where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' -and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file +and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index eb16a161e9..2cc7c13c44 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -16,7 +16,7 @@ SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset_tmp UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct_tmp; +FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ -- Views on final tables CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS @@ -30,7 +30,7 @@ SELECT * FROM ${stats_db_name}.dataset_datasources UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_datasources; +FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS SELECT * @@ -43,7 +43,7 @@ SELECT * FROM ${stats_db_name}.dataset_citations UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_citations; +FROM ${stats_db_name}.otherresearchproduct_citations; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS SELECT * @@ -56,7 +56,7 @@ SELECT * FROM ${stats_db_name}.dataset_classifications UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_classifications; +FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS SELECT * @@ -69,7 +69,7 @@ SELECT * FROM ${stats_db_name}.dataset_concepts UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_concepts; +FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS SELECT * @@ -82,7 +82,7 @@ SELECT * FROM ${stats_db_name}.dataset_languages UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_languages; +FROM 
${stats_db_name}.otherresearchproduct_languages; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS SELECT * @@ -95,7 +95,7 @@ SELECT * FROM ${stats_db_name}.dataset_oids UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_oids; +FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * @@ -108,7 +108,7 @@ SELECT * FROM ${stats_db_name}.dataset_pids UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_pids; +FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * @@ -121,9 +121,9 @@ SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * -FROM ${stats_db_name}.otherresearchproduct_topics; +FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/ create table ${stats_db_name}.result_fos stored as parquet as with @@ -133,22 +133,22 @@ with select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) - join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; +DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.target like '50|%' - and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; + and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; 
+DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 07204db0c1..3f40dbb215 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -5,7 +5,7 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_tmp ( @@ -22,7 +22,7 @@ CREATE TABLE ${stats_db_name}.datasource_tmp `compatibility` STRING, issn_printed STRING, issn_online STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ -- Insert statement that takes into account the piwik_id of the openAIRE graph INSERT INTO ${stats_db_name}.datasource_tmp @@ -46,14 +46,14 @@ FROM ${openaire_db_name}.datasource d1 LATERAL VIEW EXPLODE(originalid) temp AS originalidd WHERE originalidd like "piwik:%") AS d2 ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference = FALSE 
and d1.datainfo.invisible=false; +WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /*EOS*/ -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/ -INSERT INTO ${stats_db_name}.dual VALUES ('X'); +INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/ INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) @@ -71,42 +71,42 @@ SELECT 'other', null, null FROM ${stats_db_name}.dual -WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -DROP TABLE ${stats_db_name}.dual; +WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/ +DROP TABLE ${stats_db_name}.dual; /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; /*EOS*/ +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +where 
d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ -- datasource sources: -- where the datasource info have been collected from. 
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; +DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf -where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; +where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; +FROM ${stats_db_name}.result_datasources; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index 19d301e276..afde8160ef 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -3,7 +3,7 @@ -- Organization table/view and Organization related tables/views ---------------------------------------------------------------- ---------------------------------------------------------------- -DROP TABLE IF EXISTS ${stats_db_name}.organization purge; +DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS SELECT substr(o.id, 4) as id, @@ -11,12 +11,12 @@ SELECT substr(o.id, 4) as id, o.legalshortname.value as legalshortname, o.country.classid as country FROM ${openaire_db_name}.organization o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; +WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/ 
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource -FROM ${stats_db_name}.datasource_organizations; +FROM ${stats_db_name}.datasource_organizations; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id -FROM ${stats_db_name}.project_organizations; \ No newline at end of file +FROM ${stats_db_name}.project_organizations; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 813fffcf9f..1460477aec 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -154,180 +154,354 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - - + + yarn + cluster + Step1 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + + + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step2 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql + --stats_db_name${stats_db_name} + 
--openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step3 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step4 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step6 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + 
--sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step7 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step8 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step9 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step10 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf 
spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step11 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step12 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step13 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - 
openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step14 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step15 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - + + yarn + cluster + Step15_5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + @@ -379,23 +553,45 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step16_1-definitions + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + 
--sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - + + yarn + cluster + Step16_5 + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + @@ -461,12 +657,23 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - observatory_db_name=${observatory_db_name} - + + yarn + cluster + Step21-createObservatoryDB + eu.dnetlib.dhp.oozie.RunSQLSparkJob + dhp-stats-update-${projectVersion}.jar + + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} + + --hiveMetastoreUris${hive_metastore_uris} + --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql + --stats_db_name${stats_db_name} + --openaire_db_name${openaire_db_name} + From 0b897f2f667a2bbbf9cb57af5ea4651c01a438c6 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 16 Apr 2024 18:17:54 +0300 Subject: [PATCH 02/19] Fix and add missing "DROP TABLE" statements, in "dhp-stats-update" sql-scripts. 
--- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql | 3 ++- .../stats/oozie_app/scripts/step16-createIndicatorsTables.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 4 +++- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 2 ++ 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 06600db192..f4d06587bc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -20,7 +20,8 @@ WHERE project_tmp.id IN (SELECT pr.id ${stats_db_name}.result r WHERE pr.result = r.id AND r.type = 'publication'); /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.stored purge; /*EOS*/ + +DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project stored as parquet as SELECT p.id, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 70cde64815..c2231d54ce 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -965,6 +965,8 @@ select allresults.ri_initiative, result_findable.no_result_findable/allresults.n from allresults join result_findable on 
result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/ +drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/ + create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as with org_names_pids as (select org.id,name, pid from ${stats_db_name}.organization org diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 426d537739..0cffff052f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -58,6 +58,8 @@ FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/ + CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS SELECT substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 6b5adff9d6..d742bcc2a7 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -50,6 +50,8 @@ FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = 
false and o.datainfo.invisible=false; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/ + CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype @@ -72,7 +74,7 @@ SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS dataso FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p - LEFT OUTER JOIN(SELECT substr(d.id, 4) id + LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 3f40dbb215..23fa743f90 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -51,6 +51,8 @@ WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /* -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
-- Creating a temporary dual table that will be removed after the following insert +DROP TABLE IF EXISTS ${stats_db_name}.dual purge; /*EOS*/ + CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/ INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/ From ca091c0f1e578d57467d9e1c496050420e6d8175 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 17 Apr 2024 14:03:59 +0300 Subject: [PATCH 03/19] dhp-stats-update: - Fix not passing some parameters to some Spark actions. - Allow the workflow to run up to Step7. The first 7 steps seem to work out of the box. --- dhp-workflows/dhp-stats-update/runOozieWorkfow.sh | 2 +- .../dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh index a4825a3aea..2f2fc29d51 100755 --- a/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh +++ b/dhp-workflows/dhp-stats-update/runOozieWorkfow.sh @@ -17,4 +17,4 @@ cat ./target/extract-and-run-on-remote-host.log # oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info # Get the from the previous output and check the logs: -# yarn logs -applicationId +# yarn logs -applicationId application_ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 1460477aec..c2c6f98229 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -171,8 +171,7 @@ --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} - - + @@ -304,7 +303,8 @@ --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} - + + @@ -369,6 +369,7 @@ 
--sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} + --external_stats_db_name${external_stats_db_name} @@ -391,6 +392,7 @@ --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} + --external_stats_db_name${external_stats_db_name} @@ -501,6 +503,7 @@ --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} + --external_stats_db_name${external_stats_db_name} @@ -672,7 +675,7 @@ --hiveMetastoreUris${hive_metastore_uris} --sqleu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql --stats_db_name${stats_db_name} - --openaire_db_name${openaire_db_name} + --observatory_db_name${observatory_db_name} From 6f2ebb2a52fa99735e92867ffdfbd701926f3dd8 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 18 Apr 2024 15:35:03 +0300 Subject: [PATCH 04/19] Revert Step8 and Step11 to use Hive again, since their "UPDATE" statements are not supported by Spark. 
--- .../graph/stats/oozie_app/scripts/step11.sql | 18 +++---- .../graph/stats/oozie_app/scripts/step8.sql | 37 +++++++------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 49 ++++++++++++++----- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index f4d06587bc..207c1b1243 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -10,7 +10,7 @@ SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd - WHERE d.id = rd.datasource); /*EOS*/ + WHERE d.id = rd.datasource); -- /*EOS*/ -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables UPDATE ${stats_db_name}.project_tmp @@ -19,9 +19,9 @@ WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result = r.id - AND r.type = 'publication'); /*EOS*/ + AND r.type = 'publication'); -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.project purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.project stored as parquet as SELECT p.id, @@ -64,7 +64,7 @@ FROM ${stats_db_name}.project_tmp p AND r.type = 'publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; /*EOS*/ + ON prr2.id = p.id; -- /*EOS*/ UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' @@ -74,7 +74,7 @@ WHERE publication_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = 
p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' @@ -84,7 +84,7 @@ WHERE dataset_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' @@ -94,7 +94,7 @@ WHERE software_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' @@ -104,7 +104,7 @@ WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id ${stats_db_name}.project_tmp p WHERE r.id = pr.result AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); /*EOS*/ + AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, @@ -117,4 +117,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; /*EOS*/ \ No newline at end of file + AND project.id = result_projects.project; -- /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 23fa743f90..07e19d68b8 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ 
-5,7 +5,7 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_tmp ( @@ -23,6 +23,7 @@ CREATE TABLE ${stats_db_name}.datasource_tmp issn_printed STRING, issn_online STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- /*EOS*/ -- Insert statement that takes into account the piwik_id of the openAIRE graph INSERT INTO ${stats_db_name}.datasource_tmp @@ -46,16 +47,16 @@ FROM ${openaire_db_name}.datasource d1 LATERAL VIEW EXPLODE(originalid) temp AS originalidd WHERE originalidd like "piwik:%") AS d2 ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; /*EOS*/ +WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- /*EOS*/ -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
-- Creating a temporary dual table that will be removed after the following insert -DROP TABLE IF EXISTS ${stats_db_name}.dual purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.dual purge; -- /*EOS*/ -CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); /*EOS*/ +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); -- /*EOS*/ -INSERT INTO ${stats_db_name}.dual VALUES ('X'); /*EOS*/ +INSERT INTO ${stats_db_name}.dual VALUES ('X'); -- /*EOS*/ INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) @@ -73,42 +74,42 @@ SELECT 'other', null, null FROM ${stats_db_name}.dual -WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); /*EOS*/ -DROP TABLE ${stats_db_name}.dual; /*EOS*/ +WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -- /*EOS*/ +DROP TABLE ${stats_db_name}.dual; -- /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; /*EOS*/ +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -- /*EOS*/ +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ +where d.datainfo.deletedbyinference=false and 
d.datainfo.invisible=false; -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; -- /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- /*EOS*/ -- datasource sources: -- where the datasource info have been collected from. 
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; -- /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf -where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ +where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -- /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; /*EOS*/ +FROM ${stats_db_name}.result_datasources; -- /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index c2c6f98229..5c255a488c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -308,7 +308,7 @@ - + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + + + @@ -375,7 +386,7 @@ - + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + external_stats_db_name=${external_stats_db_name} + + + From d46b78b65949a58447821e050ed0179a8173c404 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 18 Apr 2024 15:40:27 +0300 Subject: [PATCH 05/19] dhp-stats-update: - Set Steps 2-7 and 9 to limit the amount of files generated by Spark, from 8000, down to 100, to improve file-transfer and querying performance. - Allow the workflow to run up to Step10. The Step11 seems to have some issues even when using hive-action. 
--- .../oa/graph/stats/oozie_app/scripts/step2.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step3.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step4.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step5.sql | 16 ++++++++-------- .../oa/graph/stats/oozie_app/scripts/step6.sql | 12 ++++++------ .../oa/graph/stats/oozie_app/scripts/step7.sql | 9 ++++----- .../oa/graph/stats/oozie_app/scripts/step9.sql | 2 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 6 +++--- 8 files changed, 46 insertions(+), 47 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 4aa90b1a24..8ec663573e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -44,7 +44,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, instancetype.classname as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -52,7 +52,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case 
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -63,7 +63,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance @@ -76,14 +76,14 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS -select substr(p.id, 4) as id, p.language.classname as language +select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -91,7 +91,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_pids 
STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -99,7 +99,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as -select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic +select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -107,7 +107,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 1ff4beadb1..ebedb5dc5a 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -45,7 +45,7 @@ WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -54,7 +54,7 @@ WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -62,7 +62,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE 
'^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -73,7 +73,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS -SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM ( SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource FROM ${openaire_db_name}.dataset p @@ -87,14 +87,14 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, p.language.classname AS language +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -102,7 +102,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where 
p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -110,7 +110,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 0cffff052f..4957d8d2f2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -44,7 +44,7 @@ where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS -SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -53,7 +53,7 @@ where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") 
!= DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -61,7 +61,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -72,7 +72,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS -SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM ( SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource FROM ${openaire_db_name}.software p @@ -86,14 +86,14 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS -select substr(p.id, 4) AS id, p.language.classname AS language +select /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM 
${openaire_db_name}.software p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -101,7 +101,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ @@ -109,7 +109,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index d742bcc2a7..820ec43959 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -45,7 +45,7 @@ WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS -SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites +SELECT /*+ COALESCE(100) */ substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/ @@ -53,14 +53,14 @@ WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, instancetype.classname AS type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, case +SELECT /*+ 
COALESCE(100) */ substr(p.id, 4) as id, case when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept @@ -70,7 +70,7 @@ where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS -SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p @@ -81,27 +81,27 @@ FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) A DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, p.language.classname AS language +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where 
p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 75ec7d69c8..d2688ec073 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -6,14 +6,14 @@ DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS -SELECT substr(p.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p 
LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS -SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype = 'projectOrganization' and r.source like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ @@ -21,7 +21,7 @@ WHERE r.reltype = 'projectOrganization' and r.source like '40|%' DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.target like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/ @@ -29,7 +29,7 @@ WHERE r.reltype = 'resultProject' and r.target like '40|%' DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/ create table ${stats_db_name}.project_classification STORED AS PARQUET as -select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 +select /*+ COALESCE(100) */ substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/ @@ -93,7 +93,7 @@ WHERE p.datainfo.deletedbyinference = false 
and p.datainfo.invisible=false; /*EO DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/ create table ${stats_db_name}.funder STORED AS PARQUET as -select distinct xpath_string(fund, '//funder/id') as id, +select /*+ COALESCE(100) */ distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname, xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country @@ -102,7 +102,7 @@ from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fun DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/ CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS -SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +SELECT /*+ COALESCE(100) */ distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, properties[0].value contribution, properties[1].value currency from ${openaire_db_name}.relation r LATERAL VIEW explode (r.properties) properties diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 2cc7c13c44..f3ab520041 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -130,7 +130,7 @@ with lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields 
of Science and Technology classification') -select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 +select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); /*EOS*/ @@ -138,7 +138,7 @@ from lvl1 DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' and r.target like '50|%' @@ -147,8 +147,7 @@ WHERE r.reltype = 'resultOrganization' DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS -select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance +select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ - + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index afde8160ef..1d76b89a66 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -6,7 +6,7 @@ DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS -SELECT substr(o.id, 4) as id, +SELECT /*+ COALESCE(100) */ substr(o.id, 4) as id, o.legalname.value as name, o.legalshortname.value as legalshortname, o.country.classid as country diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 5c255a488c..37d837e765 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -303,8 +303,7 @@ --stats_db_name${stats_db_name} --openaire_db_name${openaire_db_name} - - + @@ -382,7 +381,8 @@ --openaire_db_name${openaire_db_name} --external_stats_db_name${external_stats_db_name} - + + From 2616971e2bc23ee172ffc64e0a6730ed1356f9a8 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 18 Apr 2024 16:18:16 +0300 Subject: [PATCH 06/19] dhp-stats-update: remove leftover duplicate line --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 07e19d68b8..90c3ebef6a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -22,7 +22,6 @@ CREATE TABLE ${stats_db_name}.datasource_tmp 
`compatibility` STRING, issn_printed STRING, issn_online STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- /*EOS*/ -- Insert statement that takes into account the piwik_id of the openAIRE graph From 888637773cfc4d076d27d5fc83bd1b5008f521c2 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 27 May 2024 12:34:49 +0300 Subject: [PATCH 07/19] Add missing "/*EOS*/" comments. --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql | 2 +- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql | 2 +- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql | 2 +- .../oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql | 2 +- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql | 2 +- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 9a9a507e37..8cab942e67 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ------------------------------------------------------ ------------------------------------------------------ -- Additional relations diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 989b92268b..1f3027b7df 100644 
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ------------------------------------------------------ ------------------------------------------------------ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index be5d42f968..d18cf569fd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ------------------------------------------- --- Extra tables, mostly used by indicators diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index dd830a24d4..0da4394c84 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ ---------------------------------------------------- -- Shortcuts for various definitions in stats db --- diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index e723ec8b16..416298e4c4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ -- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, -- peer reviewed) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 59213c4d58..f0e5ce0910 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -1,4 +1,4 @@ -set mapred.job.queue.name=analytics; +set mapred.job.queue.name=analytics; /*EOS*/ -------------------------------------------------------------- -------------------------------------------------------------- @@ -113,4 +113,4 @@ SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; + and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ From a644a6f4fef8a1d2dfdc21c9edaa463a24855d77 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 29 May 2024 
12:10:11 +0300 Subject: [PATCH 08/19] Catch Spark-sql errors and show a log with the statement that failed. --- .../main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java index 027bf0735d..01d1b9f6ab 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oozie/RunSQLSparkJob.java @@ -65,7 +65,13 @@ public class RunSQLSparkJob { for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) { log.info("executing: {}", statement); long startTime = System.currentTimeMillis(); - spark.sql(statement).show(); + try { + spark.sql(statement).show(); + } catch (Exception e) { + log.error("Error executing statement: {}", statement, e); + System.err.println("Error executing statement: " + statement + "\n" + e); + throw e; + } log .info( "executed in {}", From 54e11b6a4352862b762b5127fe38a5cea28092c0 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Jul 2024 13:03:15 +0300 Subject: [PATCH 09/19] Improve performance and efficiency by rewriting the creation process of "publication", "project", "dataset", "datasource", "software", "otherresearchproduct" and "result" tables, to be performed in a single query, for each one. 
--- .../graph/stats/oozie_app/scripts/step11.sql | 102 ---------------- .../graph/stats/oozie_app/scripts/step12.sql | 40 +----- .../stats/oozie_app/scripts/step16_5.sql | 74 ++++-------- .../graph/stats/oozie_app/scripts/step2.sql | 68 +++++------ .../graph/stats/oozie_app/scripts/step3.sql | 65 +++++----- .../graph/stats/oozie_app/scripts/step4.sql | 68 +++++------ .../graph/stats/oozie_app/scripts/step5.sql | 66 +++++----- .../graph/stats/oozie_app/scripts/step6.sql | 114 ++++++++++-------- .../graph/stats/oozie_app/scripts/step7.sql | 10 +- .../graph/stats/oozie_app/scripts/step8.sql | 99 +++++---------- 10 files changed, 248 insertions(+), 458 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 207c1b1243..7597f14293 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -4,108 +4,6 @@ ---------------------------------------------------------------- ---------------------------------------------------------------- ---Datasource temporary table updates -UPDATE ${stats_db_name}.datasource_tmp -SET harvested='true' -WHERE datasource_tmp.id IN (SELECT DISTINCT d.id - FROM ${stats_db_name}.datasource_tmp d, - ${stats_db_name}.result_datasources rd - WHERE d.id = rd.datasource); -- /*EOS*/ - --- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables -UPDATE ${stats_db_name}.project_tmp -SET haspubs='yes' -WHERE project_tmp.id IN (SELECT pr.id - FROM ${stats_db_name}.project_results pr, - ${stats_db_name}.result r - WHERE pr.result = r.id - AND r.type = 'publication'); -- /*EOS*/ - -DROP TABLE IF EXISTS 
${stats_db_name}.project purge; -- /*EOS*/ - -CREATE TABLE ${stats_db_name}.project stored as parquet as -SELECT p.id, - p.acronym, - p.title, - p.funder, - p.funding_lvl0, - p.funding_lvl1, - p.funding_lvl2, - p.ec39, - p.type, - p.startdate, - p.enddate, - p.start_year, - p.end_year, - p.duration, - CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs, - CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs, - CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, - CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, - p.callidentifier, - p.code, - p.totalcost, - p.fundedamount, - p.currency -FROM ${stats_db_name}.project_tmp p - LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np - FROM ${stats_db_name}.project_results pr - INNER JOIN ${stats_db_name}.result r ON pr.result = r.id - WHERE r.type = 'publication' - GROUP BY pr.id) AS prr1 on prr1.id = p.id - LEFT JOIN (SELECT pp.id, - max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub, - count(distinct r.id) AS dp - FROM ${stats_db_name}.project_tmp pp, - ${stats_db_name}.project_results pr, - ${stats_db_name}.result r - WHERE pp.id = pr.id - AND pr.result = r.id - AND r.type = 'publication' - AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 - GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; -- /*EOS*/ - -UPDATE ${stats_db_name}.publication_tmp -SET delayed = 'yes' -WHERE publication_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ - -UPDATE ${stats_db_name}.dataset_tmp -SET delayed = 'yes' -WHERE dataset_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ - 
-UPDATE ${stats_db_name}.software_tmp -SET delayed = 'yes' -WHERE software_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ - -UPDATE ${stats_db_name}.otherresearchproduct_tmp -SET delayed = 'yes' -WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id - FROM ${stats_db_name}.result r, - ${stats_db_name}.project_results pr, - ${stats_db_name}.project_tmp p - WHERE r.id = pr.result - AND pr.id = p.id - AND to_date(r.date) - to_date(p.enddate) > 0); -- /*EOS*/ - CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index ff95524be8..4f0b45fed2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -1,42 +1,4 @@ ------------------------------------------------------------------------------------------------------- --- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables ------------------------------------------------------------------------------------------------------- -DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/ - -CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS -SELECT * -FROM ${stats_db_name}.datasource_tmp; /*EOS*/ - -DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/ - -CREATE TABLE ${stats_db_name}.publication stored AS parquet AS -SELECT * -FROM 
${stats_db_name}.publication_tmp; /*EOS*/ - -DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/ - -CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS -SELECT * -FROM ${stats_db_name}.dataset_tmp; /*EOS*/ - -DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/ - -CREATE TABLE ${stats_db_name}.software stored AS parquet AS -SELECT * -FROM ${stats_db_name}.software_tmp; /*EOS*/ - -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/ - -CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS -SELECT * -FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ - -DROP TABLE ${stats_db_name}.project_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.datasource_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.publication_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.dataset_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.software_tmp; /*EOS*/ -DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ +set mapred.job.queue.name=analytics; /*EOS*/ ---------------------------------------------- -- Re-creating views from final parquet tables diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index 416298e4c4..a2be22603d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -1,58 +1,26 @@ set mapred.job.queue.name=analytics; /*EOS*/ --- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, +-- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold, -- peer reviewed) -drop table if exists ${stats_db_name}.result_tmp; /*EOS*/ -CREATE TABLE 
${stats_db_name}.result_tmp ( - id STRING, - title STRING, - publisher STRING, - journal STRING, - `date` STRING, - `year` INT, - bestlicence STRING, - access_mode STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING , - peer_reviewed BOOLEAN, - green BOOLEAN, - gold BOOLEAN) -clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); /*EOS*/ - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.publication r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.dataset r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.software r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold 
on gold.id=r.id; /*EOS*/ - -insert into ${stats_db_name}.result_tmp -select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold -FROM ${stats_db_name}.otherresearchproduct r -LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id -LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ - -drop table if exists ${stats_db_name}.result; /*EOS*/ drop view if exists ${stats_db_name}.result; /*EOS*/ -create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; /*EOS*/ -drop table ${stats_db_name}.result_tmp; /*EOS*/ \ No newline at end of file +drop table if exists ${stats_db_name}.result; /*EOS*/ + +CREATE TABLE ${stats_db_name}.result stored as parquet as +SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy') as `year`, r.bestlicence, r.bestlicence as access_mode, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold +FROM ( -- Each branch projects only the columns the outer select reads, with unique names, so that "r.<column>" is never ambiguous. + (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.publication) + UNION ALL + (SELECT id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.dataset) + UNION ALL + (select id, title, publisher, journal, `date`, bestlicence, embargo_end_date, delayed, authors, source, abstract, type + FROM ${stats_db_name}.software) + UNION ALL + (select id, title, publisher, journal, `date`, bestlicence, embargo_end_date,
delayed, authors, source, abstract, type + FROM ${stats_db_name}.otherresearchproduct) + ) r +LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id +LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id +LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index f0e5ce0910..0abec2358d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -7,41 +7,41 @@ set mapred.job.queue.name=analytics; /*EOS*/ -------------------------------------------------------------- -- Publication temporary table -DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.publication_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING -) - clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/ + +CREATE TABLE ${stats_db_name}.publication stored as parquet as +with pub_pr as ( + select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.publication pub + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on 
pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false +), + pub_delayed as ( + select pub_id, max(delayed) as delayed + from pub_pr + group by pub_id + ) +select /*+ COALESCE(100) */ + substr(pub.id, 4) as id, + pub.title[0].value as title, + pub.publisher.value as publisher, + pub.journal.name as journal, + pub.dateofacceptance.value as date, + date_format(pub.dateofacceptance.value, 'yyyy') as year, + pub.bestaccessright.classname as bestlicence, + pub.embargoenddate.value as embargo_end_date, + coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects. + size(pub.author) as authors, + concat_ws('\u003B', pub.source.value) as source, + case when size(pub.description) > 0 then true else false end as abstract, + 'publication' as type +from ${openaire_db_name}.publication pub + left outer join pub_delayed on pub.id=pub_delayed.pub_id +where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.publication_tmp -SELECT substr(p.id, 4) as id, - p.title[0].value as title, - p.publisher.value as publisher, - p.journal.name as journal, - p.dateofacceptance.value as date, - date_format(p.dateofacceptance.value, 'yyyy') as year, - p.bestaccessright.classname as bestlicence, - p.embargoenddate.value as embargo_end_date, - false as delayed, - size(p.author) as authors, - concat_ws('\u003B', p.source.value) as source, - case when size(p.description) > 0 then true else false end as abstract, - 'publication' as type -from ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/ diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index ebedb5dc5a..8f203fc838 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,42 +5,41 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.dataset_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING +CREATE TABLE ${stats_db_name}.dataset stored as parquet as +with datast_pr as ( + select datast.id as datast_id, case when (to_date(datast.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.dataset datast + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=datast.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false +), +datast_delayed as ( + select datast_id, max(delayed) as delayed + from datast_pr + group by datast_id ) - clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ +select /*+ COALESCE(100) */ + substr(datast.id, 4) as id, + 
datast.title[0].value as title, + datast.publisher.value as publisher, + cast(null as string) as journal, + datast.dateofacceptance.value as date, + date_format(datast.dateofacceptance.value, 'yyyy') as year, + datast.bestaccessright.classname as bestlicence, + datast.embargoenddate.value as embargo_end_date, + coalesce(datast_delayed.delayed, false) as delayed, -- It's delayed, when the dataset was published after the end of the project. + size(datast.author) as authors, + concat_ws('\u003B', datast.source.value) as source, + case when size(datast.description) > 0 then true else false end as abstract, + 'dataset' as type +from ${openaire_db_name}.dataset datast + left outer join datast_delayed on datast.id=datast_delayed.datast_id +where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.dataset_tmp -SELECT substr(d.id, 4) AS id, - d.title[0].value AS title, - d.publisher.value AS publisher, - cast(null AS string) AS journal, - d.dateofacceptance.value as date, - date_format(d.dateofacceptance.value, 'yyyy') AS year, - d.bestaccessright.classname AS bestlicence, - d.embargoenddate.value AS embargo_end_date, - false AS delayed, - size(d.author) AS authors, - concat_ws('\u003B', d.source.value) AS source, - CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, - 'dataset' AS type -FROM ${openaire_db_name}.dataset d -WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 4957d8d2f2..5e4c9f4201 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,41 +5,41 @@ -------------------------------------------------------- -- Software temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.software_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING -) - clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/ + +CREATE TABLE ${stats_db_name}.software stored as parquet as +with soft_pr as ( + select soft.id as soft_id, case when (to_date(soft.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.software soft + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=soft.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false +), +soft_delayed as ( + select soft_id, max(delayed) as delayed + from soft_pr + group by soft_id +) +select /*+ COALESCE(100) */ + substr(soft.id, 4) as id, + soft.title[0].value as title, + soft.publisher.value as publisher, + cast(null as string) as journal, + soft.dateofacceptance.value as date, + date_format(soft.dateofacceptance.value, 'yyyy') as year, + soft.bestaccessright.classname as bestlicence, + soft.embargoenddate.value as embargo_end_date, + coalesce(soft_delayed.delayed, false) as delayed, -- It's delayed, when the software was published 
after the end of the project. + size(soft.author) as authors, + concat_ws('\u003B', soft.source.value) as source, + case when size(soft.description) > 0 then true else false end as abstract, + 'software' as type +from ${openaire_db_name}.software soft + left outer join soft_delayed on soft.id=soft_delayed.soft_id +where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.software_tmp -SELECT substr(s.id, 4) as id, - s.title[0].value AS title, - s.publisher.value AS publisher, - CAST(NULL AS string) AS journal, - s.dateofacceptance.value AS DATE, - date_format(s.dateofacceptance.value, 'yyyy') AS YEAR, - s.bestaccessright.classname AS bestlicence, - s.embargoenddate.value AS embargo_end_date, - FALSE AS delayed, - SIZE(s.author) AS authors, - concat_ws('\u003B', s.source.value) AS source, - CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, - 'software' as type -from ${openaire_db_name}.software s -where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 820ec43959..3134ba316b 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,41 +5,41 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/ -CREATE TABLE 
${stats_db_name}.otherresearchproduct_tmp -( - id STRING, - title STRING, - publisher STRING, - journal STRING, - date STRING, - year STRING, - bestlicence STRING, - embargo_end_date STRING, - delayed BOOLEAN, - authors INT, - source STRING, - abstract BOOLEAN, - type STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ +CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as +with other_pr as ( + select other.id as other_id, case when (to_date(other.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed + from ${openaire_db_name}.otherresearchproduct other + join ${openaire_db_name}.relation rel + on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=other.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false + where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false +), +other_delayed as ( + select other_id, max(delayed) as delayed + from other_pr + group by other_id +) +select /*+ COALESCE(100) */ + substr(other.id, 4) as id, + other.title[0].value as title, + other.publisher.value as publisher, + cast(null as string) as journal, + other.dateofacceptance.value as date, + date_format(other.dateofacceptance.value, 'yyyy') as year, + other.bestaccessright.classname as bestlicence, + other.embargoenddate.value as embargo_end_date, + coalesce(other_delayed.delayed, false) as delayed, -- It's delayed, when the other-research-product was published after the end of at least one of its projects. + size(other.author) as authors, + concat_ws('\u003B', other.source.value) as source, + case when size(other.description) > 0 then true else false end as abstract, + 'other' as type +from ${openaire_db_name}.otherresearchproduct other + left outer join other_delayed on other.id=other_delayed.other_id +where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false; /*EOS*/ -INSERT INTO
${stats_db_name}.otherresearchproduct_tmp -SELECT substr(o.id, 4) AS id, - o.title[0].value AS title, - o.publisher.value AS publisher, - CAST(NULL AS string) AS journal, - o.dateofacceptance.value AS DATE, - date_format(o.dateofacceptance.value, 'yyyy') AS year, - o.bestaccessright.classname AS bestlicence, - o.embargoenddate.value as embargo_end_date, - FALSE AS delayed, - SIZE(o.author) AS authors, - concat_ws('\u003B', o.source.value) AS source, - CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, - 'other' AS type -FROM ${openaire_db_name}.otherresearchproduct o -WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false; /*EOS*/ -- Otherresearchproduct_citations DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index d2688ec073..de6b803ab7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -34,61 +34,69 @@ from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge; /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.project_tmp -( - id STRING, - acronym STRING, - title STRING, - funder STRING, - funding_lvl0 STRING, - funding_lvl1 STRING, - funding_lvl2 STRING, - ec39 STRING, - type STRING, - startdate STRING, - enddate STRING, - start_year INT, - end_year INT, - duration INT, - haspubs STRING, - numpubs INT, - 
daysforlastpub INT, - delayedpubs INT, - callidentifier STRING, - code STRING, - totalcost FLOAT, - fundedamount FLOAT, - currency STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); /*EOS*/ +CREATE TABLE ${stats_db_name}.project stored as parquet as +with pr_pub as ( + select pr.id as pr_id, pub.id as pub_id, + (case when datediff(pub.dt_dateofacceptance, pr.dt_enddate) > 0 then true else false end) as delayed, + max(datediff(pub.dt_dateofacceptance, pr.dt_enddate)) as daysForlastPub + from (select id, to_date(dateofacceptance.value) as dt_dateofacceptance from ${openaire_db_name}.publication + where datainfo.deletedbyinference = false and datainfo.invisible = false) pub + join ${openaire_db_name}.relation rel + on rel.reltype = 'resultProject' and rel.relclass = 'isProducedBy' and rel.source=pub.id + and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false + join (select id, to_date(enddate.value) as dt_enddate from ${openaire_db_name}.project + where datainfo.deletedbyinference = false and datainfo.invisible = false) pr + on pr.id=rel.target + group by pr.id, pub.id, pub.dt_dateofacceptance, pr.dt_enddate +), +num_pubs_pr as ( + select pr_id, count( distinct pub_id) as num_pubs + from pr_pub + group by pr_id +), +pub_delayed as ( + select pr_id, pub_id, max(delayed) as delayed + from pr_pub + group by pr_id, pub_id +), +num_pub_delayed as ( + select pr_id, count(distinct pub_id) as num_delayed + from pub_delayed + where delayed + group by pr_id +) +select /*+ COALESCE(100) */ + substr(p.id, 4) as id, + p.acronym.value as acronym, + p.title.value as title, + xpath_string(p.fundingtree[0].value, '//funder/name') as funder, + xpath_string(p.fundingtree[0].value, '//funding_level_0/name') as funding_lvl0, + xpath_string(p.fundingtree[0].value, '//funding_level_1/name') as funding_lvl1, + xpath_string(p.fundingtree[0].value, '//funding_level_2/name') as funding_lvl2, + p.ecsc39.value as ec39, + 
p.contracttype.classname as type, + p.startdate.value as startdate, + p.enddate.value as enddate, + year(p.startdate.value) as start_year, + year(p.enddate.value) as end_year, + cast(months_between(p.enddate.value, p.startdate.value) as int) as duration, + case when num_pubs_pr.pr_id is null then 'no' else 'yes' end as haspubs, + coalesce(num_pubs_pr.num_pubs, 0) as numpubs, + coalesce(last_pub.daysForlastPub, 0) as daysForlastPub, + coalesce(npd.num_delayed, 0) as delayedpubs, + p.callidentifier.value as callidentifier, + p.code.value as code, + p.totalcost as totalcost, + p.fundedamount as fundedamount, + p.currency.value as currency +from ${openaire_db_name}.project p +left outer join (select pr_id, max(daysForlastPub) as daysForlastPub from pr_pub where delayed group by pr_id) last_pub on last_pub.pr_id = p.id -- Aggregated to one row per project (delayed publications only), otherwise each project row is repeated once per linked publication. +left outer join num_pubs_pr on num_pubs_pr.pr_id = p.id +left outer join num_pub_delayed npd on npd.pr_id=p.id +where p.datainfo.deletedbyinference = false and p.datainfo.invisible = false; /*EOS*/ -INSERT INTO ${stats_db_name}.project_tmp -SELECT substr(p.id, 4) AS id, - p.acronym.value AS acronym, - p.title.value AS title, - xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, - xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, - xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, - xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, - p.ecsc39.value AS ec39, - p.contracttype.classname AS type, - p.startdate.value AS startdate, - p.enddate.value AS enddate, - year(p.startdate.value) AS start_year, - year(p.enddate.value) AS end_year, - CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, - 'no' AS haspubs, - 0 AS numpubs, - 0 AS daysforlastpub, - 0 AS delayedpubs, - p.callidentifier.value AS callidentifier, - p.code.value AS code, - p.totalcost AS totalcost, - p.fundedamount AS fundedamount, - p.currency.value AS currency -FROM ${openaire_db_name}.project p -WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS
${stats_db_name}.funder purge; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 7a23991fe5..2283e62917 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -7,16 +7,16 @@ -- Views on temporary tables that should be re-created in the end CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.publication_tmp +FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.software_tmp +FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.dataset_tmp +FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode -FROM ${stats_db_name}.otherresearchproduct_tmp; /*EOS*/ +FROM ${stats_db_name}.otherresearchproduct; /*EOS*/ -- Views on final tables CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS @@ -153,4 +153,4 @@ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; /*EOS*/ \ No newline at end of file + JOIN ${stats_db_name}.project p ON p.id = pr.id; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 90c3ebef6a..b280dc9469 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -5,81 +5,36 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge; -- /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/ -CREATE TABLE ${stats_db_name}.datasource_tmp -( - `id` string, - `name` STRING, - `type` STRING, - `dateofvalidation` STRING, - `yearofvalidation` string, - `harvested` BOOLEAN, - `piwik_id` INT, - `latitude` STRING, - `longitude` STRING, - `websiteurl` STRING, - `compatibility` STRING, - issn_printed STRING, - issn_online STRING -) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- /*EOS*/ +CREATE TABLE ${stats_db_name}.datasource stored as parquet as +with piwik_datasource as ( + select id, split(originalidd, '\\:')[1] as piwik_id + from ${openaire_db_name}.datasource + lateral view explode(originalid) temp as originalidd + where originalidd like "piwik:%" +) +select /*+ COALESCE(100) */ + substr(dtrce.id, 4) as id, + case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name, + dtrce.datasourcetype.classname as type, + dtrce.dateofvalidation.value as dateofvalidation, + case when dtrce.dateofvalidation.value='-1' then null else date_format(dtrce.dateofvalidation.value, 'yyyy') end as yearofvalidation, + case when res.d_id is null then false else true end as harvested, + case when piwik_d.piwik_id is null then 0 else piwik_d.piwik_id end as piwik_id, + dtrce.latitude.value as latitude, + dtrce.longitude.value as longitude, + dtrce.websiteurl.value as websiteurl, + dtrce.openairecompatibility.classid as compatibility, 
+ dtrce.journal.issnprinted as issn_printed, + dtrce.journal.issnonline as issn_online +from ${openaire_db_name}.datasource dtrce + left outer join (select distinct inst.hostedby.key as d_id from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst) res on res.d_id=dtrce.id /* distinct: at most one row per hosting datasource, so this join only feeds the harvested flag and cannot multiply datasource rows */ + left outer join piwik_datasource piwik_d on piwik_d.id=dtrce.id +where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/ --- Insert statement that takes into account the piwik_id of the openAIRE graph -INSERT INTO ${stats_db_name}.datasource_tmp -SELECT substr(d1.id, 4) AS id, - officialname.value AS name, - datasourcetype.classname AS type, - dateofvalidation.value AS dateofvalidation, - date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation, - FALSE AS harvested, - CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id, - d1.latitude.value AS latitude, - d1.longitude.value AS longitude, - d1.websiteurl.value AS websiteurl, - d1.openairecompatibility.classid AS compatibility, - d1.journal.issnprinted AS issn_printed, - d1.journal.issnonline AS issn_online -FROM ${openaire_db_name}.datasource d1 - LEFT OUTER JOIN - (SELECT id, split(originalidd, '\\:')[1] as piwik_id - FROM ${openaire_db_name}.datasource - LATERAL VIEW EXPLODE(originalid) temp AS originalidd - WHERE originalidd like "piwik:%") AS d2 - ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- /*EOS*/ --- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
--- Creating a temporary dual table that will be removed after the following insert - -DROP TABLE IF EXISTS ${stats_db_name}.dual purge; -- /*EOS*/ - -CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); -- /*EOS*/ - -INSERT INTO ${stats_db_name}.dual VALUES ('X'); -- /*EOS*/ - -INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, - `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) -SELECT 'other', - 'Other', - 'Repository', - NULL, - NULL, - false, - 0, - NULL, - NULL, - NULL, - 'unknown', - null, - null -FROM ${stats_db_name}.dual -WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); -- /*EOS*/ -DROP TABLE ${stats_db_name}.dual; -- /*EOS*/ - -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; -- /*EOS*/ -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; -- /*EOS*/ - -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; -- /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language From aa4d7d5e20370435e7863a16526a113aa80bcb6f Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Jul 2024 19:14:25 +0300 Subject: [PATCH 10/19] Prioritize the rest of the stats-queries over other tasks on the cluster, by putting them in the "analytics" queue. 
--- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql | 2 ++ .../oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 2 ++ .../graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql | 2 ++ .../stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql | 2 ++ .../stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql | 2 ++ .../stats/oozie_app/scripts/step20-createMonitorDB_funded.sql | 2 ++ .../oozie_app/scripts/step20-createMonitorDB_institutions.sql | 2 ++ .../stats/oozie_app/scripts/step21-createObservatoryDB.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql | 2 ++ 18 files changed, 36 insertions(+) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql index 467a98872c..4551d62821 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step1.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -------------------------------------------------------------- -------------------------------------------------------------- -- Stats database creation 
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 9088ce2052..48d8961ff5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 7597f14293..e98a778dbd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ---------------------------------------------------------------- ---------------------------------------------------------------- -- Post processing - Updates on main tables diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 68a46ded33..20784bce9b 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Additional relations diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 4abb6bdbce..4940bb96d9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.context as select * from SOURCE.context; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index a8392b2267..7e31408bcd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + drop database if exists TARGET cascade; create database if not exists TARGET; diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql index 4469782f0a..9dab792227 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql @@ -1,5 +1,7 @@ drop database if exists TARGET cascade; create database if not exists TARGET; +set mapred.job.queue.name=analytics; /*EOS*/ + create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql index a28206d56b..c6ef15d45d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql @@ -1,5 +1,7 @@ drop database if exists TARGET cascade; create database if not exists TARGET; +set mapred.job.queue.name=analytics; /*EOS*/ + create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql index ce6475c222..8e3db49ff2 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql @@ -1,5 +1,7 @@ drop database if exists TARGET cascade; create database if not exists TARGET; +set mapred.job.queue.name=analytics; /*EOS*/ + create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index 62c68c625b..286ffe427c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -1,5 +1,7 @@ drop database if exists TARGET cascade; create database if not exists TARGET; +set mapred.job.queue.name=analytics; /*EOS*/ + create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index adcf23b7ad..1a7f34e96d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + create table 
${observatory_db_name}.result_cc_licence stored as parquet as select r.id, coalesce(rln.count, 0) > 0 as cc_licence from ${stats_db_name}.result r diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 8f203fc838..0e1e02b120 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Dataset table/view and Dataset related tables/views diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 5e4c9f4201..0ccb17fccf 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -------------------------------------------------------- -------------------------------------------------------- -- Software table/view and Software related tables/views diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 3134ba316b..cd7834d841 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- Otherresearchproduct table/view and Otherresearchproduct related tables/views diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index de6b803ab7..d261c96e26 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ------------------------------------------------------ ------------------------------------------------------ -- Project table/view and Project related tables/views diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 2283e62917..6cab86a414 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ---------------------------------------------------- ---------------------------------------------------- -- Result table/view and Result related tables/views diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index b280dc9469..d0b6abad93 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + -- noinspection SqlNoDataSourceInspectionForFile ------------------------------------------------------------ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index 1d76b89a66..f504a5c127 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -1,3 +1,5 @@ +set mapred.job.queue.name=analytics; /*EOS*/ + ---------------------------------------------------------------- ---------------------------------------------------------------- -- Organization table/view and Organization related tables/views From 7ce051d7668c6d8081a512d4c623d701891ecc7b Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Jul 2024 19:49:19 +0300 Subject: [PATCH 11/19] - Update the remaining hive-actions to spark-actions. - Update the version of shell-actions. - Fix missing "/*EOS*/" indicators. 
--- .../graph/stats/oozie_app/scripts/step11.sql | 2 +- .../graph/stats/oozie_app/scripts/step14.sql | 2 +- .../graph/stats/oozie_app/scripts/step7.sql | 6 +- .../graph/stats/oozie_app/scripts/step8.sql | 16 ++--- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 70 ++++++------------- 5 files changed, 36 insertions(+), 60 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index e98a778dbd..48373af9b7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -17,4 +17,4 @@ FROM ${stats_db_name}.result_projects, ${stats_db_name}.project WHERE result_projects.id = result.id AND result.type = 'publication' - AND project.id = result_projects.project; -- /*EOS*/ \ No newline at end of file + AND project.id = result_projects.project; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 163e924158..9bbf944806 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -66,4 +66,4 @@ DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute -WHERE 
datainfo.deletedbyinference=false and datainfo.invisible = FALSE; +WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 6cab86a414..1323adf807 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -135,9 +135,9 @@ with lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification') select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 from lvl1 - join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) - join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) - join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); + join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) + join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index d0b6abad93..b15efac4a1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -41,31 +41,31 @@ DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/ +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; -- /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids -where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/ +where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; -- /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- /*EOS*/ +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ -- datasource sources: -- where the datasource info have been collected from. 
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; -- /*EOS*/ +DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS select substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf -where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -- /*EOS*/ +where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; -- /*EOS*/ +FROM ${stats_db_name}.result_datasources; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index d6fc864c39..8422af4c9a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -307,7 +307,7 @@ - - - - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - - - @@ -381,12 +370,11 @@ --openaire_db_name${openaire_db_name} --external_stats_db_name${external_stats_db_name} - - + - - - - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - openaire_db_name=${openaire_db_name} - external_stats_db_name=${external_stats_db_name} - - - @@ -533,7 +509,7 @@ - + ${jobTracker} ${nameNode} contexts.sh @@ -624,7 +600,7 @@ - + ${jobTracker} ${nameNode} finalizedb.sh @@ -637,7 +613,7 @@ - + ${jobTracker} ${nameNode} monitor.sh @@ -670,7 +646,7 @@ - + ${jobTracker} ${nameNode} observatory-pre.sh @@ -706,7 +682,7 @@ - + ${jobTracker} ${nameNode} observatory-post.sh @@ -719,7 +695,7 @@ - + 
${jobTracker} ${nameNode} copyDataToImpalaCluster.sh @@ -738,7 +714,7 @@ - + ${jobTracker} ${nameNode} createPDFsAggregated.sh @@ -754,7 +730,7 @@ - + ${jobTracker} ${nameNode} finalizeImpalaCluster.sh @@ -773,7 +749,7 @@ - + ${jobTracker} ${nameNode} updateCache.sh From 7b7dd32ad5e4eb08d995ae04dc909b87eed01875 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Jul 2024 19:53:24 +0300 Subject: [PATCH 12/19] - Fix placement of some "set mapred.job.queue.name=analytics" statements and remove their unused "/*EOS*/" indicator. - Add stacktrace-info to failed actions. --- .../stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql | 4 ++-- .../oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql | 4 ++-- .../stats/oozie_app/scripts/step20-createMonitorDB_funded.sql | 4 ++-- .../oozie_app/scripts/step20-createMonitorDB_institutions.sql | 4 ++-- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 2 ++ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 2 +- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 3 ++- 7 files changed, 13 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql index 9dab792227..0f3dc1d4f2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs.sql @@ -1,7 +1,7 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; -set mapred.job.queue.name=analytics; /*EOS*/ - create table TARGET.result stored as parquet as select distinct * from ( diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql index c6ef15d45d..2a082c2cd0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_RIs_tail.sql @@ -1,7 +1,7 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; -set mapred.job.queue.name=analytics; /*EOS*/ - create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql index 8e3db49ff2..759843d680 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_funded.sql @@ -1,7 +1,7 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; -set mapred.job.queue.name=analytics; /*EOS*/ - create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index 
286ffe427c..118ccddac7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -1,7 +1,7 @@ +set mapred.job.queue.name=analytics; + drop database if exists TARGET cascade; create database if not exists TARGET; -set mapred.job.queue.name=analytics; /*EOS*/ - create table TARGET.result stored as parquet as select distinct * from ( diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 1323adf807..0717d7897b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -139,6 +139,8 @@ from lvl1 join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/ +DROP TABLE ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/ + DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index b15efac4a1..9a3995a8fc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -67,5 +67,5 @@ from ${openaire_db_name}.datasource d lateral view 
explode(d.collectedfrom) cfro where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS -SELECT datasource AS id, id AS result +SELECT /*+ COALESCE(100) */ datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 8422af4c9a..d08cf8f595 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -150,7 +150,8 @@ - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())} + ${wf:actionData(wf:lastErrorNode())['stackTrace']}] From ce0aee21cce147911d8e3cdc7c5d3c6606cfb012 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Jul 2024 20:15:33 +0300 Subject: [PATCH 13/19] Improve performance of transferring the stats-DBs to another cluster and querying the DBs' tables, by instructing Spark to create up to 100 files per table, instead of thousands. 
--- .../graph/stats/oozie_app/scripts/step13.sql | 16 +- .../graph/stats/oozie_app/scripts/step14.sql | 17 +- .../graph/stats/oozie_app/scripts/step15.sql | 12 +- .../stats/oozie_app/scripts/step15_5.sql | 10 +- .../scripts/step16-createIndicatorsTables.sql | 182 +++++++++--------- .../scripts/step16_1-definitions.sql | 6 +- .../scripts/step21-createObservatoryDB.sql | 38 ++-- .../graph/stats/oozie_app/scripts/step8.sql | 10 +- 8 files changed, 145 insertions(+), 146 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 20784bce9b..a590c190ea 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -10,7 +10,7 @@ set mapred.job.queue.name=analytics; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -23,7 +23,7 @@ LEFT OUTER JOIN DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, 
substr(datasource, 4) as datasource from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -36,7 +36,7 @@ LEFT OUTER JOIN DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -49,7 +49,7 @@ LEFT OUTER JOIN DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as -SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource +SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM ( SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p @@ -71,7 +71,7 @@ SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as -select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid +select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid FROM ${openaire_db_name}.result res @@ -83,7 +83,7 @@ from ( DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as -select substr(rel.source, 
4) as source, substr(rel.target, 4) as target, relclass, subreltype +select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r2 on r2.id=rel.target @@ -98,7 +98,7 @@ where reltype='resultResult' DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as -select substr(target, 4) as id, count(distinct substr(source, 4)) as citations +select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r2 on r2.id=rel.target @@ -115,7 +115,7 @@ group by substr(target, 4); /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as -select substr(source, 4) as id, count(distinct substr(target, 4)) as references +select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id join ${openaire_db_name}.result r2 on r2.id=rel.target diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 9bbf944806..9e71b88f5b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -1,4 +1,5 @@ set mapred.job.queue.name=analytics; /*EOS*/ + 
------------------------------------------------------ ------------------------------------------------------ -- Additional relations @@ -9,28 +10,28 @@ set mapred.job.queue.name=analytics; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS -SELECT substr(p.id, 4) as id, licenses.value as 
type +SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/ @@ -46,15 +47,15 @@ SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS -select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid +select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as -SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource +SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource FROM ( - SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource + SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o LEFT OUTER JOIN ( SELECT substr(d.id, 4) id @@ -64,6 +65,6 @@ FROM ( DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/ CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as -select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result +select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute WHERE 
datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/ diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 1f3027b7df..08609affff 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -18,7 +18,7 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all @@ -36,7 +36,7 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all @@ -54,7 +54,7 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all @@ -72,7 +72,7 @@ non_peer_reviewed as ( select distinct substr(r.id, 4) as id, 
inst.refereed.classname as refereed from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed') -select distinct * +select /*+ COALESCE(100) */ distinct * from ( select peer_reviewed.* from peer_reviewed union all @@ -92,7 +92,7 @@ select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as -select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, +select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score, cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/ @@ -100,7 +100,7 @@ where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as -select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, +select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name, cast(rel.properties[0].value as double) apc_amount, rel.properties[1].value apc_currency from ${openaire_db_name}.relation rel diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index d18cf569fd..d61b4d2ef1 
100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -6,7 +6,7 @@ set mapred.job.queue.name=analytics; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as -select r.id, count(distinct p.id) as count +select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project @@ -15,7 +15,7 @@ group by r.id; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/ create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as -select r.id, count(distinct p.funder) as count +select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count from ${stats_db_name}.result r left outer join ${stats_db_name}.result_projects rp on rp.id=r.id left outer join ${stats_db_name}.project p on p.id=rp.project @@ -30,7 +30,7 @@ with rcount as ( left outer join ${stats_db_name}.result_projects rp on rp.project=p.id left outer join ${stats_db_name}.result r on r.id=rp.id group by r.type, p.id ) -select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications, +select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications, sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets, sum(case when rcount.type='software' then rcount.count else 0 end) as software, sum(case when rcount.type='other' then rcount.count else 0 end) as other @@ -48,7 +48,7 @@ create or replace view ${stats_db_name}.graduatedoctorates as select * from stat DROP TABLE IF EXISTS 
${stats_db_name}.result_instance purge; /*EOS*/ create table if not exists ${stats_db_name}.result_instance stored as parquet as -select distinct r.* +select /*+ COALESCE(100) */ distinct r.* from ( select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom, substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid @@ -58,7 +58,7 @@ join ${stats_db_name}.result res on res.id=r.id; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/ create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as -select distinct r.id, r.amount, r.currency +select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 5624874744..6e7f00b536 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,7 +1,7 @@ -- Sprint 1 ---- drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as -select distinct p.id, coalesce(green_oa, 0) as green_oa +select /*+ COALESCE(100) */ distinct p.id, 
coalesce(green_oa, 0) as green_oa from ${stats_db_name}.publication p left outer join ( select p.id, 1 as green_oa @@ -12,7 +12,7 @@ left outer join ( drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as -select distinct p.id, coalesce(grey_lit, 0) as grey_lit +select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit from ${stats_db_name}.publication p left outer join ( select p.id, 1 as grey_lit @@ -23,7 +23,7 @@ left outer join ( drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as -select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref +select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from ${stats_db_name}.publication p left outer join ( select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri @@ -33,7 +33,7 @@ left outer join ( -- Sprint 2 ---- drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as -select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license +select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from ${stats_db_name}.result r left outer join ( select r.id, license.type as lic from ${stats_db_name}.result r @@ -42,7 +42,7 @@ left outer join ( drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as -select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url +select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 
end as has_cc_license_url from ${stats_db_name}.result r left outer join ( select r.id, lower(parse_url(license.type, "HOST")) as lic_host @@ -52,12 +52,12 @@ left outer join ( drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as -select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract +select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract from ${stats_db_name}.publication; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as -select distinct r.id, coalesce(has_orcid, 0) as has_orcid +select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid from ${stats_db_name}.result r left outer join ( select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/ @@ -66,7 +66,7 @@ left outer join ( ---- Sprint 3 ---- drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as -select distinct r.result as id, coalesce(fundref, 0) as fundref +select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref from ${stats_db_name}.project_results r left outer join ( select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/ @@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par SELECT ro.organization organization, ro.id, o.name from ${stats_db_name}.result_organization ro join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null) - select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as 
collaborations + select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/ @@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store from ${stats_db_name}.result_organization ro join ${stats_db_name}.organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) - select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations + select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/ @@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa select o.id organization, o.name, ro.project as project from ${stats_db_name}.organization o join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null) - select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations + select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.name<>o2.name @@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor select o.id organization, o.name, o.country , ro.project as project from ${stats_db_name}.organization o join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null) - select o1.organization 
org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations + select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country @@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as join ${stats_db_name}.organization o on o.id=op.id join ${stats_db_name}.project p on p.id=op.project where country <> 'UNKNOWN') - select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations + select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations from tmp as f1 join tmp as f2 on f1.project=f2.project where f1.country<>f2.country @@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as select distinct country, ro.id as result from ${stats_db_name}.organization o join ${stats_db_name}.result_organization ro on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) - select o1.country country1, o2.country country2, count(o1.result) as collaborations + select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.country<>o2.country @@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as ---- Sprint 4 ---- drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as - select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal + select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from ${stats_db_name}.publication_datasources pd left outer join ( 
select pd.id, 1 as in_diamond_journal @@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as - select distinct pd.id, coalesce(is_transformative, 0) as is_transformative + select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative from ${stats_db_name}.publication pd left outer join ( select pd.id, 1 as is_transformative @@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as - select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open + select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from ${stats_db_name}.result_instance ri left outer join ( select ri.id, 1 as pub_closed_other_open @@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as ---- Sprint 5 ---- drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as - select id, count(id) as number_of_copies + select /*+ COALESCE(100) */ id, count(id) as number_of_copies from ${stats_db_name}.result_instance group by id; /*EOS*/ ---- Sprint 6 ---- drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as - SELECT result_id, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join ${stats_db_name}.publication on result_id=id where 
downloads>0 @@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as - SELECT result_id, repository_id, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join ${stats_db_name}.publication on result_id=id where downloads>0 @@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as - SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join ${stats_db_name}.publication on result_id=id where downloads>0 GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as - SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads + SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join ${stats_db_name}.publication on result_id=id where downloads>0 @@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a UNION ALL select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo ) - SELECT 
DISTINCT pd.id, coalesce(is_gold, 0) as is_gold + SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold FROM ${stats_db_name}.publication pd left outer join ( select pd.id, 1 as is_gold @@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as FROM ${stats_db_name}.datasource WHERE issn_online IS NOT NULL ) as issn WHERE LENGTH(issn) > 7) - SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa + SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa FROM ${stats_db_name}.publication_datasources pd LEFT OUTER JOIN ( SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd @@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as -select distinct p.id, coalesce(is_hybrid, 0) is_hybrid +select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid from ${stats_db_name}.publication p left outer join ( select p.id, 1 as is_hybrid @@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet where cast(year as int)>2003 group by ro.organization) --return results_fair/all_results - select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization; /*EOS*/ @@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as -select ar.organization, 
rf.no_result_fair/ar.no_allresults org_fairness +select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness from allresults ar join result_fair rf on rf.organization=ar.organization; /*EOS*/ @@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as -select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/ @@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as -select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness +select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness from allresults ar join result_fair rf on rf.organization=ar.organization; /*EOS*/ @@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as - select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and 
cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/ @@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as -select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable +select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/ @@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as -select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable +select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/ @@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as -select allpubsshare.organization, +select /*+ COALESCE(100) */ allpubsshare.organization, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare @@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all drop table if exists 
${stats_db_name}.indi_org_openess_year purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as -select cast(allpubsshare.year as int) year, allpubsshare.organization, +select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare @@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as -select distinct p.id, coalesce(has_preprint, 0) as has_preprint +select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint from ${stats_db_name}.publication_classifications p left outer join ( select p.id, 1 as has_preprint @@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as -select distinct p.id, coalesce(is_subscription, 0) as is_subscription +select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription from ${stats_db_name}.publication p left outer join( select p.id, 1 as is_subscription from ${stats_db_name}.publication p @@ -640,7 +640,7 @@ from ${stats_db_name}.publication p drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as -select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid +select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid from ${stats_db_name}.result p left outer join ( select p.id, 1 as result_with_pid @@ -654,7 +654,7 @@ group by rf.id; /*EOS*/ drop table if exists 
${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as -select distinct p.id as id, coalesce(is_interdisciplinary, 0) +select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0) as is_interdisciplinary from pub_fos_totals p left outer join ( @@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/ create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as -select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa +select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa from ${stats_db_name}.publication p left outer join ( select p.id, 1 as is_bronze_oa @@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/ drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as -select pry.project_id, pry.acronym, pry.result_id, +select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id, coalesce(is_project_result_after, 0) as is_project_result_after from project_year_result_year pry left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after @@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/ drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as -select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s +select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s from ${stats_db_name}.funder f left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp @@ -722,7 +722,7 @@ create table 
if not exists ${stats_db_name}.indi_funder_fairness stored as parqu join ${stats_db_name}.project p on p.id=rp.project where cast(year as int)>2003 group by p.funder) -select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness +select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness from allresults join result_fair on result_fair.funder=allresults.funder; /*EOS*/ @@ -745,7 +745,7 @@ allresults as join ${stats_db_name}.result r on r.id=rc.id where cast(year as int)>2003 group by rc.ri_initiative) -select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness +select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness from allresults join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/ @@ -817,16 +817,14 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as -select allpubsshare.funder, - (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) - +(case when d is null then 0 else 1 end)) - funder_openess FROM allpubsshare - left outer join (select funder,d from - alldatasetssshare) tmp1 - on tmp1.funder=allpubsshare.funder - left outer join (select funder,s from - allsoftwaresshare) tmp2 - on tmp2.funder=allpubsshare.funder; /*EOS*/ +select /*+ COALESCE(100) */ allpubsshare.funder, + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) + +(case when d is null then 0 else 1 end)) funder_openess +FROM allpubsshare + left outer join (select funder,d from alldatasetssshare) tmp1 + on tmp1.funder=allpubsshare.funder + left outer join (select funder,s from allsoftwaresshare) tmp2 + on tmp2.funder=allpubsshare.funder; 
/*EOS*/ DROP VIEW pubs_oa; /*EOS*/ DROP VIEW datasets_oa; /*EOS*/ @@ -905,7 +903,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as -select allpubsshare.ri_initiative, +select /*+ COALESCE(100) */ allpubsshare.ri_initiative, (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) ris_openess FROM allpubsshare @@ -943,7 +941,7 @@ with result_findable as join ${stats_db_name}.project p on p.id=rp.project where cast(year as int)>2003 group by p.funder) -select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable +select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable from allresults join result_findable on result_findable.funder=allresults.funder; /*EOS*/ @@ -952,22 +950,22 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as with result_contexts as -(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc -join ${stats_db_name}.concept on concept.id=rc.concept -join ${stats_db_name}.category on category.id=concept.category -join ${stats_db_name}.context on context.id=category.context), -result_findable as - (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc - join ${stats_db_name}.result r on r.id=rc.id - join ${stats_db_name}.result_pids rp on rp.id=r.id - where cast(r.year as int)>2003 - group by rc.ri_initiative), -allresults as -(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc - join ${stats_db_name}.result r on r.id=rc.id - where cast(r.year as int)>2003 - group 
by rc.ri_initiative) -select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable + (select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc + join ${stats_db_name}.concept on concept.id=rc.concept + join ${stats_db_name}.category on category.id=concept.category + join ${stats_db_name}.context on context.id=category.context), + result_findable as + (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc + join ${stats_db_name}.result r on r.id=rc.id + join ${stats_db_name}.result_pids rp on rp.id=r.id + where cast(r.year as int)>2003 + group by rc.ri_initiative), + allresults as + (select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc + join ${stats_db_name}.result r on r.id=rc.id + where cast(r.year as int)>2003 + group by rc.ri_initiative) +select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable from allresults join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/ @@ -975,20 +973,20 @@ drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/ create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as with org_names_pids as -(select org.id,name, pid from ${stats_db_name}.organization org -join ${stats_db_name}.organization_pids op on org.id=op.id), -publicly_funded_orgs as -(select distinct name from -(select pf.name from stats_ext.insitutions_for_publicly_funded pf -join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government' -union all -select pf.name from stats_ext.insitutions_for_publicly_funded pf -join ${stats_db_name}.project p on p.funder=pf.name -union all -select op.name from stats_ext.insitutions_for_publicly_funded pf -join org_names_pids op on (op.name=pf.name or op.pid=pf.ror) -and pf.publicly_funded='yes') 
foo) -select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded + (select org.id,name, pid from ${stats_db_name}.organization org + join ${stats_db_name}.organization_pids op on org.id=op.id), + publicly_funded_orgs as + (select distinct name from + (select pf.name from stats_ext.insitutions_for_publicly_funded pf + join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government' + union all + select pf.name from stats_ext.insitutions_for_publicly_funded pf + join ${stats_db_name}.project p on p.funder=pf.name + union all + select op.name from stats_ext.insitutions_for_publicly_funded pf + join org_names_pids op on (op.name=pf.name or op.pid=pf.ror) + and pf.publicly_funded='yes') foo) +select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded from ${stats_db_name}.publication p left outer join ( select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro @@ -997,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/ create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as -select distinct p.id, coalesce(green_with_license, 0) as green_with_license +select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license from ${stats_db_name}.publication p left outer join ( select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p @@ -1008,7 +1006,7 @@ left outer join ( drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ create table ${stats_db_name}.result_country stored as parquet as -select distinct id, country +select /*+ COALESCE(100) */ distinct id, country from ( select ro.id, o.country from ${stats_db_name}.result_organization ro @@ -1023,7 +1021,7 @@ where rc.country is not null; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/ create 
table ${stats_db_name}.indi_result_oa_with_license stored as parquet as -select distinct r.id, coalesce(oa_with_license,0) as oa_with_license +select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license from ${stats_db_name}.result r left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/ @@ -1031,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/ create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as with without_license as -(select distinct id from ${stats_db_name}.indi_result_oa_with_license -where oa_with_license=0) -select distinct r.id, coalesce(oa_without_license,0) as oa_without_license + (select distinct id from ${stats_db_name}.indi_result_oa_with_license + where oa_with_license=0) +select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license from ${stats_db_name}.result r left outer join (select distinct r.id, 1 as oa_without_license from ${stats_db_name}.result r @@ -1044,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /* create table ${stats_db_name}.indi_result_under_transformative stored as parquet as with transformative_dois as ( select distinct doi from stats_ext.transformative_facts) -select distinct r.id, coalesce(under_transformative,0) as under_transformative +select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative from ${stats_db_name}.result r left outer join ( select distinct rp.id, 1 as under_transformative diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 0da4394c84..80256e2dfc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics; /*EOS*/ drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as -select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed +select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/ @@ -17,7 +17,7 @@ left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/ drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as -select r.id, case when green.green_oa=1 then true else false end as green +select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/ @@ -25,6 +25,6 @@ left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/ create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as -select r.id, case when gold.is_gold=1 then true else false end as gold +select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true 
else false end as gold from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index 1a7f34e96d..85d90eaf1f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -1,7 +1,7 @@ set mapred.job.queue.name=analytics; /*EOS*/ create table ${observatory_db_name}.result_cc_licence stored as parquet as -select r.id, coalesce(rln.count, 0) > 0 as cc_licence +select /*+ COALESCE(100) */ r.id, coalesce(rln.count, 0) > 0 as cc_licence from ${stats_db_name}.result r left outer join ( select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count @@ -11,7 +11,7 @@ from ${stats_db_name}.result r create table ${observatory_db_name}.result_affiliated_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -41,7 +41,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end create table ${observatory_db_name}.result_affiliated_year stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -71,7 +71,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -101,7 +101,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end create table 
${observatory_db_name}.result_affiliated_datasource stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -132,7 +132,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -163,7 +163,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -192,7 +192,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -221,7 +221,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -252,7 +252,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -283,7 +283,7 @@ group by r.green, r.gold, case when 
rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -314,7 +314,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_year stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -346,7 +346,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end create table ${observatory_db_name}.result_deposited_year_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -377,7 +377,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -408,7 +408,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -439,7 +439,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ 
-470,7 +470,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -501,7 +501,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, @@ -534,7 +534,7 @@ group by r.green, r.gold, case when rl.type is not null then true else false end cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/ create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as -select +select /*+ COALESCE(100) */ count(distinct r.id) as total, r.green, r.gold, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 9a3995a8fc..98225af149 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -39,21 +39,21 @@ where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = f DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, langs.languages AS language +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW 
explode(d.odlanguages.value) langs AS languages where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS -SELECT substr(d.id, 4) AS id, oids.ids AS oid +SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/ DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/ CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/ @@ -62,10 +62,10 @@ WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = f DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/ create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS -select substr(d.id, 4) as id, substr(cf.key, 4) as datasource +select /*+ COALESCE(100) */ substr(d.id, 4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/ CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS -SELECT /*+ COALESCE(100) */ datasource AS id, id AS result +SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; /*EOS*/ From e9686365a21dbcf4fe14e9c8ea211d4db62fe39d Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 3 Jul 2024 20:24:36 +0300 Subject: [PATCH 14/19] Improve 
performance of creating the "result_fos" table, by using a temp-table to cache data, which is requested multiple times. --- .../dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 0717d7897b..bffd59ef1a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -125,14 +125,20 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/ + +DROP TABLE IF EXISTS ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/ + +create table ${stats_db_name}.result_fos_base_tmp stored as parquet as +select /*+ COALESCE(100) */ id, topic from ${stats_db_name}.result_topics where type='Fields of Science and Technology classification'; /*EOS*/ + DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/ create table ${stats_db_name}.result_fos stored as parquet as with - lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), - lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), - lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'), - lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification') + lvl1 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '__ %'), + lvl2 as (select * from 
${stats_db_name}.result_fos_base_tmp where topic like '____ %'), + lvl3 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '______ %'), + lvl4 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '________ %') select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) From 7d2c0a3723f8abaf7fa23cbb5f729db70b8a9c72 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 23 Jul 2024 15:10:17 +0300 Subject: [PATCH 15/19] added new institutions --- .../scripts/updateMonitorDB_institutions.sql | 12 +++++++++++- .../scripts/step20-createMonitorDB_institutions.sql | 10 ++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql index 5ab8c88b58..a3f29a9e38 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB_institutions.sql @@ -61,7 +61,17 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment ))) foo; --ANALYZE TABLE TARGET.result COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql index 62c68c625b..1326979d8d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB_institutions.sql @@ -65,5 +65,11 @@ create table TARGET.result stored as parquet as 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University - 'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on 
Ecology and Forestry Applications - ))) foo; \ No newline at end of file + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment + ))) foo; \ No newline at end of file From d0590e0e4994834a3db618547735e0e080bb2b02 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 23 Jul 2024 15:17:15 +0300 Subject: [PATCH 16/19] added latest institutions --- .../oozie_app/scripts/updateMonitorDBAll.sql | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql index 35ab420298..064b5425b3 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDBAll.sql @@ -81,7 +81,17 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment ))) foo; --ANALYZE TABLE TARGET.result COMPUTE STATISTICS; From b64c144abfad208e7c0c5137adce3f1fc555706f Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Thu, 5 Sep 2024 16:00:09 +0300 Subject: [PATCH 17/19] added new institutions --- .../scripts/step20-createMonitorDBAll.sql | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index a8392b2267..ca5b0bb90a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -81,11 +81,17 @@ create table TARGET.result stored as parquet as 'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development 'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology 'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna - 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden - 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna + 'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden + 'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna 'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology - 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University - 'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University + 'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications + 'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears + 'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra + 'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University + 'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management + 'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII + 'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment ) )) foo; create view if not exists TARGET.category as select * from SOURCE.category; From 37ad259296c686ce3adee1758a2dbbdab9f35ab9 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: 
Thu, 5 Sep 2024 16:02:44 +0300 Subject: [PATCH 18/19] cleanup --- .../oozie_app/scripts/updateMonitorDB.sql | 76 ------------------- 1 file changed, 76 deletions(-) diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql index 321fba87ae..ede8a18bf0 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/scripts/updateMonitorDB.sql @@ -1,79 +1,3 @@ ---drop database if exists TARGET cascade; ---create database if not exists TARGET; --- ---create view if not exists TARGET.category as select * from SOURCE.category; ---create view if not exists TARGET.concept as select * from SOURCE.concept; ---create view if not exists TARGET.context as select * from SOURCE.context; ---create view if not exists TARGET.country as select * from SOURCE.country; ---create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; ---create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; ---create view if not exists TARGET.funder as select * from SOURCE.funder; ---create view if not exists TARGET.fundref as select * from SOURCE.fundref; ---create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; ---create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; ---create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; ---create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; ---create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; ---create view if 
not exists TARGET.hrrst as select * from SOURCE.hrrst; --- ---create table TARGET.result stored as parquet as --- select distinct * from ( --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) --- union all --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) --- union all --- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( --- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" --- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council --- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? --- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University --- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade --- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki --- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho --- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid --- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen --- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens --- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot --- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University --- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark --- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin --- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt --- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven --- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the 
Western Cape --- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute --- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University --- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg --- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) --- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr --- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw --- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly --- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete --- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus --- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras --- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki --- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank --- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech --- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University --- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona --- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University --- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia --- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University --- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje --- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan --- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork --- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University --- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech --- 
'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town --- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin --- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology --- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba --- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili --- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University --- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique --- ) )) foo; --- ---ANALYZE TABLE TARGET.result COMPUTE STATISTICS; - create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.context as select * from SOURCE.context; From dbea7a4072e8e0b75dd907d173a6cf067f4fa7d6 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 23 Sep 2024 14:57:11 +0300 Subject: [PATCH 19/19] removed duplicate line --- .../graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql index ca5b0bb90a..447fe3fb94 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDBAll.sql @@ -262,7 +262,6 @@ create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * f create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); 
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); -create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);